clarena.cl_datasets.split_tinyimagenet

The submodule in cl_datasets for the Split TinyImageNet dataset.

  1r"""
  2The submodule in `cl_datasets` for Split TinyImageNet dataset.
  3"""
  4
  5__all__ = ["SplitTinyImageNet"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from tinyimagenet import TinyImageNet
 12from torch.utils.data import Dataset, random_split
 13from torchvision import transforms
 14from torchvision.datasets import ImageFolder
 15
 16from clarena.cl_datasets import CLSplitDataset
 17
 18# always get logger for built-in logging in each module
 19pylogger = logging.getLogger(__name__)
 20
 21
 22class SplitTinyImageNet(CLSplitDataset):
 23    r"""Split TinyImageNet dataset. The [TinyImageNet dataset](http://vision.stanford.edu/teaching/cs231n/reports/2015/pdfs/yle_project.pdf) is smaller, more manageable version of the [Imagenet dataset](https://www.image-net.org). It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each 64x64 color image."""
 24
 25    original_dataset_python_class: type[Dataset] = TinyImageNet
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        class_split: dict[int, list[int]],
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44    ) -> None:
 45        r"""
 46        **Args:**
 47        - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
 48        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs ane the values are lists of class labels (integers starting from 0) to split for each task.
 49        - **validation_percentage** (`float`): The percentage to randomly split some training data into validation data.
 50        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 51        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 52        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 53        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 54        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 55        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 56        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 57        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 58        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 59        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 60        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 61        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 62        """
 63
 64        super().__init__(
 65            root=root,
 66            class_split=class_split,
 67            batch_size=batch_size,
 68            num_workers=num_workers,
 69            custom_transforms=custom_transforms,
 70            repeat_channels=repeat_channels,
 71            to_tensor=to_tensor,
 72            resize=resize,
 73        )
 74
 75        self.validation_percentage: float = validation_percentage
 76        r"""The percentage to randomly split some training data into validation data."""
 77
 78    def prepare_data(self) -> None:
 79        r"""Download the original TinyImagenet dataset if haven't."""
 80
 81        if self.task_id != 1:
 82            return  # download all original datasets only at the beginning of first task
 83
 84        TinyImageNet(self.root_t)
 85
 86        pylogger.debug(
 87            "The original TinyImageNet dataset has been downloaded to %s.",
 88            self.root_t,
 89        )
 90
 91    def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
 92        r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.
 93
 94        **Args:**
 95        - **dataset** (`ImageFolder`): the dataset to retrieve subset from.
 96
 97        **Returns:**
 98        - **subset** (`ImageFolder`): the subset of classes from the dataset.
 99        """
100        classes = self.class_split[self.task_id - 1]
101
102        # get the indices of the dataset that belong to the classes
103        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
104
105        # subset the dataset by the indices, in-place operation
106        dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
107        dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list
108
109        dataset.target_transform = (
110            self.target_transform()
111        )  # cl class mapping should be applied after the split
112
113        return dataset
114
115    def train_and_val_dataset(self) -> Dataset:
116        r"""Get the training and validation dataset of task `self.task_id`.
117
118        **Returns:**
119        - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
120        - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
121        """
122        dataset_train_and_val = self.get_subset_of_classes(
123            TinyImageNet(
124                root=self.root_t,
125                split="train",
126                transform=self.train_and_val_transforms(),
127                # cl class mapping should be applied after the split
128            )
129        )
130
131        return random_split(
132            dataset_train_and_val,
133            lengths=[1 - self.validation_percentage, self.validation_percentage],
134            generator=torch.Generator().manual_seed(
135                42
136            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
137        )
138
139    def test_dataset(self) -> Dataset:
140        r"""Get the test dataset of task `self.task_id`.
141
142        **Returns:**
143        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
144        """
145        dataset_test = self.get_subset_of_classes(
146            TinyImageNet(
147                root=self.root_t,
148                split="val",
149                transform=self.train_and_val_transforms(),
150                # cl class mapping should be applied after the split
151            )
152        )
153
154        return dataset_test
class SplitTinyImageNet(clarena.cl_datasets.base.CLSplitDataset):
 23class SplitTinyImageNet(CLSplitDataset):
 24    r"""Split TinyImageNet dataset. The [TinyImageNet dataset](http://vision.stanford.edu/teaching/cs231n/reports/2015/pdfs/yle_project.pdf) is a smaller, more manageable version of the [ImageNet dataset](https://www.image-net.org). It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each a 64x64 color image."""
 25
 26    original_dataset_python_class: type[Dataset] = TinyImageNet
 27    r"""The original dataset class."""
 28
 29    def __init__(
 30        self,
 31        root: str,
 32        class_split: dict[int, list[int]],
 33        validation_percentage: float,
 34        batch_size: int | dict[int, int] = 1,
 35        num_workers: int | dict[int, int] = 0,
 36        custom_transforms: (
 37            Callable
 38            | transforms.Compose
 39            | None
 40            | dict[int, Callable | transforms.Compose | None]
 41        ) = None,
 42        repeat_channels: int | None | dict[int, int | None] = None,
 43        to_tensor: bool | dict[int, bool] = True,
 44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 45    ) -> None:
 46        r"""
 47        **Args:**
 48        - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
 49        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
 50        - **validation_percentage** (`float`): The percentage to randomly split some training data into validation data.
 51        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 52        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 53        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 55        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 56        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 57        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 58        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 59        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 60        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 61        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 62        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 63        """
 64
 65        super().__init__(
 66            root=root,
 67            class_split=class_split,
 68            batch_size=batch_size,
 69            num_workers=num_workers,
 70            custom_transforms=custom_transforms,
 71            repeat_channels=repeat_channels,
 72            to_tensor=to_tensor,
 73            resize=resize,
 74        )
 75
 76        self.validation_percentage: float = validation_percentage
 77        r"""The percentage to randomly split some training data into validation data."""
 78
 79    def prepare_data(self) -> None:
 80        r"""Download the original TinyImagenet dataset if haven't."""
 81
 82        if self.task_id != 1:
 83            return  # download all original datasets only at the beginning of first task
 84
 85        TinyImageNet(self.root_t)
 86
 87        pylogger.debug(
 88            "The original TinyImageNet dataset has been downloaded to %s.",
 89            self.root_t,
 90        )
 91
 92    def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
 93        r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.
 94
 95        **Args:**
 96        - **dataset** (`ImageFolder`): the dataset to retrieve subset from.
 97
 98        **Returns:**
 99        - **subset** (`ImageFolder`): the subset of classes from the dataset.
100        """
101        classes = self.class_split[self.task_id - 1]
102
103        # get the indices of the dataset that belong to the classes
104        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
105
106        # subset the dataset by the indices, in-place operation
107        dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
108        dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list
109
110        dataset.target_transform = (
111            self.target_transform()
112        )  # cl class mapping should be applied after the split
113
114        return dataset
115
116    def train_and_val_dataset(self) -> Dataset:
117        r"""Get the training and validation dataset of task `self.task_id`.
118
119        **Returns:**
120        - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
121        - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
122        """
123        dataset_train_and_val = self.get_subset_of_classes(
124            TinyImageNet(
125                root=self.root_t,
126                split="train",
127                transform=self.train_and_val_transforms(),
128                # cl class mapping should be applied after the split
129            )
130        )
131
132        return random_split(
133            dataset_train_and_val,
134            lengths=[1 - self.validation_percentage, self.validation_percentage],
135            generator=torch.Generator().manual_seed(
136                42
137            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
138        )
139
140    def test_dataset(self) -> Dataset:
141        r"""Get the test dataset of task `self.task_id`.
142
143        **Returns:**
144        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
145        """
146        dataset_test = self.get_subset_of_classes(
147            TinyImageNet(
148                root=self.root_t,
149                split="val",
150                transform=self.train_and_val_transforms(),
151                # cl class mapping should be applied after the split
152            )
153        )
154
155        return dataset_test

Split TinyImageNet dataset. The TinyImageNet dataset is a smaller, more manageable version of the ImageNet dataset. It consists of 100,000 training, 10,000 validation and 10,000 test images of 200 classes, each a 64x64 color image.

SplitTinyImageNet( root: str, class_split: dict[int, list[int]], validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None)
29    def __init__(
30        self,
31        root: str,
32        class_split: dict[int, list[int]],
33        validation_percentage: float,
34        batch_size: int | dict[int, int] = 1,
35        num_workers: int | dict[int, int] = 0,
36        custom_transforms: (
37            Callable
38            | transforms.Compose
39            | None
40            | dict[int, Callable | transforms.Compose | None]
41        ) = None,
42        repeat_channels: int | None | dict[int, int | None] = None,
43        to_tensor: bool | dict[int, bool] = True,
44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
45    ) -> None:
46        r"""
47        **Args:**
48        - **root** (`str`): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
49        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
50        - **validation_percentage** (`float`): The percentage to randomly split some training data into validation data.
51        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
52        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
53        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
54        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
55        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
56        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
57        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
58        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
59        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
60        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
61        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
62        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
63        """
64
65        super().__init__(
66            root=root,
67            class_split=class_split,
68            batch_size=batch_size,
69            num_workers=num_workers,
70            custom_transforms=custom_transforms,
71            repeat_channels=repeat_channels,
72            to_tensor=to_tensor,
73            resize=resize,
74        )
75
76        self.validation_percentage: float = validation_percentage
77        r"""The percentage to randomly split some training data into validation data."""

Args:

  • root (str): the root directory where the original TinyImageNet data 'tiny-imagenet-200/' live.
  • class_split (dict[int, list[int]]): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
  • validation_percentage (float): The percentage to randomly split some training data into validation data.
  • batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
  • num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
  • custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
  • repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
  • to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
  • resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'tinyimagenet.TinyImageNet'>

The original dataset class.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
79    def prepare_data(self) -> None:
80        r"""Download the original TinyImageNet dataset if it has not been downloaded yet."""
81
82        if self.task_id != 1:
83            return  # download all original datasets only at the beginning of first task
84
85        TinyImageNet(self.root_t)
86
87        pylogger.debug(
88            "The original TinyImageNet dataset has been downloaded to %s.",
89            self.root_t,
90        )

Download the original TinyImageNet dataset if it has not been downloaded yet.

def get_subset_of_classes( self, dataset: torchvision.datasets.folder.ImageFolder) -> torchvision.datasets.folder.ImageFolder:
 92    def get_subset_of_classes(self, dataset: ImageFolder) -> ImageFolder:
 93        r"""Get a subset of classes from the dataset of current classes of `self.task_id`. It is used when constructing the split.
 94
 95        **Args:**
 96        - **dataset** (`ImageFolder`): the dataset to retrieve subset from.
 97
 98        **Returns:**
 99        - **subset** (`ImageFolder`): the subset of classes from the dataset.
100        """
101        classes = self.class_split[self.task_id - 1]
102
103        # get the indices of the dataset that belong to the classes
104        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
105
106        # subset the dataset by the indices, in-place operation
107        dataset.samples = [dataset.samples[i] for i in idx]  # samples is a list
108        dataset.targets = [dataset.targets[i] for i in idx]  # targets is a list
109
110        dataset.target_transform = (
111            self.target_transform()
112        )  # cl class mapping should be applied after the split
113
114        return dataset

Get a subset of classes from the dataset of current classes of self.task_id. It is used when constructing the split.

Args:

  • dataset (ImageFolder): the dataset to retrieve subset from.

Returns:

  • subset (ImageFolder): the subset of classes from the dataset.
def train_and_val_dataset(self) -> torch.utils.data.dataset.Dataset:
116    def train_and_val_dataset(self) -> Dataset:
117        r"""Get the training and validation dataset of task `self.task_id`.
118
119        **Returns:**
120        - **train_dataset** (`Dataset`): the training dataset of task `self.task_id`.
121        - **val_dataset** (`Dataset`): the validation dataset of task `self.task_id`.
122        """
123        dataset_train_and_val = self.get_subset_of_classes(
124            TinyImageNet(
125                root=self.root_t,
126                split="train",
127                transform=self.train_and_val_transforms(),
128                # cl class mapping should be applied after the split
129            )
130        )
131
132        return random_split(
133            dataset_train_and_val,
134            lengths=[1 - self.validation_percentage, self.validation_percentage],
135            generator=torch.Generator().manual_seed(
136                42
137            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
138        )

Get the training and validation dataset of task self.task_id.

Returns:

  • train_dataset (Dataset): the training dataset of task self.task_id.
  • val_dataset (Dataset): the validation dataset of task self.task_id.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
140    def test_dataset(self) -> Dataset:
141        r"""Get the test dataset of task `self.task_id`.
142
143        **Returns:**
144        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
145        """
146        dataset_test = self.get_subset_of_classes(
147            TinyImageNet(
148                root=self.root_t,
149                split="val",
150                transform=self.train_and_val_transforms(),
151                # cl class mapping should be applied after the split
152            )
153        )
154
155        return dataset_test

Get the test dataset of task self.task_id.

Returns:

  • test_dataset (Dataset): the test dataset of task self.task_id.