clarena.cl_datasets.split_tinyimagenet
The submodule in cl_datasets for Split TinyImageNet dataset.
r"""
The submodule in `cl_datasets` for Split TinyImageNet dataset.
"""

__all__ = ["SplitTinyImageNet"]

import logging
from typing import Callable

import torch
from tinyimagenet import TinyImageNet
from torch.utils.data import Dataset, random_split
from torchvision import transforms
from torchvision.datasets import ImageFolder

from clarena.cl_datasets import CLSplitDataset

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class SplitTinyImageNet(CLSplitDataset):
    r"""Split TinyImageNet dataset. The [TinyImageNet dataset](http://vision.stanford.edu/teaching/cs231n/reports/2015/pdfs/yle_project.pdf) is a smaller, more manageable version of the [ImageNet dataset](https://www.image-net.org). It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each a 64x64 color image."""

    original_dataset_python_class: type[Dataset] = TinyImageNet
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        class_split: dict[int, list[int]],
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
            If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
            If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
            If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
            If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
            If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
            If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        """

        super().__init__(
            root=root,
            class_split=class_split,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original TinyImageNet dataset if it hasn't been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        TinyImageNet(self.root_t)

        pylogger.debug(
            "The original TinyImageNet dataset has been downloaded to %s.",
            self.root_t,
        )

    def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
        r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.

        **Args:**
        - **dataset** (`ImageFolder`): the dataset to retrieve subset from.

        **Returns:**
        - **subset** (`ImageFolder`): the subset of classes from the dataset.
        """
        classes = set(self.class_split[self.task_id - 1])  # set for O(1) membership tests

        # filter on the targets list directly; enumerating the dataset itself
        # would load and transform every image just to read its label
        idx = [i for i, target in enumerate(dataset.targets) if target in classes]

        # subset the dataset by the indices, in-place operation
        dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
        dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list

        dataset.target_transform = (
            self.target_transform()
        )  # cl class mapping should be applied after the split

        return dataset

    def train_and_val_dataset(self) -> Dataset:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
        - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = self.get_subset_of_classes(
            TinyImageNet(
                root=self.root_t,
                split="train",
                transform=self.train_and_val_transforms(),
                # cl class mapping should be applied after the split
            )
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        # NOTE(review): the original "val" split serves as test data and reuses
        # train_and_val_transforms(); confirm the base class does not expect a
        # dedicated test transform here
        dataset_test = self.get_subset_of_classes(
            TinyImageNet(
                root=self.root_t,
                split="val",
                transform=self.train_and_val_transforms(),
                # cl class mapping should be applied after the split
            )
        )

        return dataset_test
class
SplitTinyImageNet(clarena.cl_datasets.base.CLSplitDataset):
class SplitTinyImageNet(CLSplitDataset):
    r"""Split TinyImageNet dataset. The [TinyImageNet dataset](http://vision.stanford.edu/teaching/cs231n/reports/2015/pdfs/yle_project.pdf) is a smaller, more manageable version of the [ImageNet dataset](https://www.image-net.org). It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each a 64x64 color image."""

    original_dataset_python_class: type[Dataset] = TinyImageNet
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        class_split: dict[int, list[int]],
        validation_percentage: float,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
            If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
            If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
            If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
            If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
            If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
            If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        """

        super().__init__(
            root=root,
            class_split=class_split,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original TinyImageNet dataset if it hasn't been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        TinyImageNet(self.root_t)

        pylogger.debug(
            "The original TinyImageNet dataset has been downloaded to %s.",
            self.root_t,
        )

    def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
        r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.

        **Args:**
        - **dataset** (`ImageFolder`): the dataset to retrieve subset from.

        **Returns:**
        - **subset** (`ImageFolder`): the subset of classes from the dataset.
        """
        classes = set(self.class_split[self.task_id - 1])  # set for O(1) membership tests

        # filter on the targets list directly; enumerating the dataset itself
        # would load and transform every image just to read its label
        idx = [i for i, target in enumerate(dataset.targets) if target in classes]

        # subset the dataset by the indices, in-place operation
        dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
        dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list

        dataset.target_transform = (
            self.target_transform()
        )  # cl class mapping should be applied after the split

        return dataset

    def train_and_val_dataset(self) -> Dataset:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
        - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
        """
        dataset_train_and_val = self.get_subset_of_classes(
            TinyImageNet(
                root=self.root_t,
                split="train",
                transform=self.train_and_val_transforms(),
                # cl class mapping should be applied after the split
            )
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        # NOTE(review): the original "val" split serves as test data and reuses
        # train_and_val_transforms(); confirm the base class does not expect a
        # dedicated test transform here
        dataset_test = self.get_subset_of_classes(
            TinyImageNet(
                root=self.root_t,
                split="val",
                transform=self.train_and_val_transforms(),
                # cl class mapping should be applied after the split
            )
        )

        return dataset_test
Split TinyImageNet dataset. The TinyImageNet dataset is a smaller, more manageable version of the ImageNet dataset. It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each a 64x64 color image.
SplitTinyImageNet( root: str, class_split: dict[int, list[int]], validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None)
def __init__(
    self,
    root: str,
    class_split: dict[int, list[int]],
    validation_percentage: float,
    batch_size: int | dict[int, int] = 1,
    num_workers: int | dict[int, int] = 0,
    custom_transforms: (
        Callable
        | transforms.Compose
        | None
        | dict[int, Callable | transforms.Compose | None]
    ) = None,
    repeat_channels: int | None | dict[int, int | None] = None,
    to_tensor: bool | dict[int, bool] = True,
    resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
) -> None:
    r"""Initialise the Split TinyImageNet dataset.

    **Args:**
    - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
    - **class_split** (`dict[int, list[int]]`): the classes assigned to each task, keyed by task ID; each value is a list of class labels (integers starting from 0).
    - **validation_percentage** (`float`): the fraction of training data randomly carved out as validation data.
    - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders; a dict maps task IDs to per-task batch sizes, an `int` applies to all tasks.
    - **num_workers** (`int` | `dict[int, int]`): the number of dataloader workers; a dict maps task IDs to per-task values, an `int` applies to all tasks.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): custom transforms applied ONLY to the TRAIN dataset; `ToTensor()`, normalization, permute, and so on are not included. A dict maps task IDs to per-task transforms; a single transform applies to all tasks; `None` applies no custom transforms.
    - **repeat_channels** (`int` | `None` | dict of them): how many channels to repeat per task; a dict maps task IDs to per-task values, an `int` applies to all tasks, `None` (the default) means no repeat.
    - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform (default `True`); a dict maps task IDs to per-task flags.
    - **resize** (`tuple[int, int]` | `None` or dict of them): target size for image resizing; a dict maps task IDs to per-task sizes, a single tuple applies to all tasks, `None` (the default) means no resize.
    """
    # everything except validation_percentage is handled by the base class
    shared_kwargs = dict(
        root=root,
        class_split=class_split,
        batch_size=batch_size,
        num_workers=num_workers,
        custom_transforms=custom_transforms,
        repeat_channels=repeat_channels,
        to_tensor=to_tensor,
        resize=resize,
    )
    super().__init__(**shared_kwargs)

    self.validation_percentage: float = validation_percentage
    r"""The percentage to randomly split some training data into validation data."""
Args:
- root (
str): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live. - class_split (
dict[int, list[int]]): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task. - validation_percentage (
float): The percentage to randomly split some training data into validation data. - batch_size (
int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks. - num_workers (
int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks. - custom_transforms (
transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied. - repeat_channels (
int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied. - to_tensor (
bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks. - resize (
tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'tinyimagenet.TinyImageNet'>
The original dataset class.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def
prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Download the original TinyImageNet dataset if it hasn't been downloaded yet."""
    # the download is triggered once, at the beginning of the first task;
    # subsequent tasks reuse the already-downloaded data
    if self.task_id == 1:
        TinyImageNet(self.root_t)

        pylogger.debug(
            "The original TinyImageNet dataset has been downloaded to %s.",
            self.root_t,
        )
Download the original TinyImageNet dataset if it hasn't been downloaded yet.
def
get_subset_of_classes( self, dataset: torchvision.datasets.folder.ImageFolder) -> torchvision.datasets.folder.ImageFolder:
def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
    r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.

    **Args:**
    - **dataset** (`ImageFolder`): the dataset to retrieve subset from.

    **Returns:**
    - **subset** (`ImageFolder`): the subset of classes from the dataset.
    """
    classes = set(self.class_split[self.task_id - 1])  # set for O(1) membership tests

    # filter on the targets list directly; enumerating the dataset itself
    # would load and transform every image just to read its label
    idx = [i for i, target in enumerate(dataset.targets) if target in classes]

    # subset the dataset by the indices, in-place operation
    dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
    dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list

    dataset.target_transform = (
        self.target_transform()
    )  # cl class mapping should be applied after the split

    return dataset
Get a subset of classes from the dataset of current classes of self.task_id. It is used when constructing the split.
Args:
- dataset (
ImageFolder): the dataset to retrieve subset from.
Returns:
- subset (
ImageFolder): the subset of classes from the dataset.
def
train_and_val_dataset(self) -> torch.utils.data.dataset.Dataset:
def train_and_val_dataset(self) -> Dataset:
    r"""Get the training and validation dataset of task `self.task_id`.

    **Returns:**
    - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
    - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
    """
    full_train_split = TinyImageNet(
        root=self.root_t,
        split="train",
        transform=self.train_and_val_transforms(),
        # cl class mapping should be applied after the split
    )
    task_dataset = self.get_subset_of_classes(full_train_split)

    val_fraction = self.validation_percentage
    # the seed is fixed so the train/val partition is identical across
    # experiments; deliberately decoupled from the global seed, which may vary
    split_generator = torch.Generator().manual_seed(42)

    return random_split(
        task_dataset,
        lengths=[1 - val_fraction, val_fraction],
        generator=split_generator,
    )
Get the training and validation dataset of task self.task_id.
Returns:
- train_dataset (
Dataset): the training dataset of task self.task_id. - val_dataset (
Dataset): the validation dataset of task self.task_id.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
def test_dataset(self) -> Dataset:
    r"""Get the test dataset of task `self.task_id`.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
    """
    # the original TinyImageNet "val" split is used as held-out test data
    # NOTE(review): this reuses train_and_val_transforms() for test data —
    # confirm the base class does not expect a dedicated test transform here
    full_val_split = TinyImageNet(
        root=self.root_t,
        split="val",
        transform=self.train_and_val_transforms(),
        # cl class mapping should be applied after the split
    )
    return self.get_subset_of_classes(full_val_split)
Get the test dataset of task self.task_id.
Returns:
- test_dataset (
Dataset): the test dataset of task self.task_id.