clarena.cl_datasets.split_cub2002011

The submodule in cl_datasets for Split CUB-200-2011 dataset.

View Source

  1r"""
  2The submodule in `cl_datasets` for Split CUB-200-2011 dataset.
  3"""
  4
  5__all__ = ["SplitCUB2002011"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision import transforms
 13
 14from clarena.cl_datasets import CLSplitDataset
 15from clarena.stl_datasets.raw import CUB2002011
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class SplitCUB2002011(CLSplitDataset):
 22    r"""Split CUB-200-2011 dataset. The [CUB (Caltech-UCSD Birds)-200-2011](https://www.vision.caltech.edu/datasets/cub_200_2011/) is a bird image dataset. It consists of 11,788 color images of 200 bird species (classes), officially split into 5,994 training and 5,794 test images; the images vary in size. The validation set is split off from the training images according to `validation_percentage`."""
 23
 24    original_dataset_python_class: type[Dataset] = CUB2002011
 25    r"""The original dataset class."""
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        class_split: dict[int, list[int]],
 31        validation_percentage: float,
 32        batch_size: int | dict[int, int] = 1,
 33        num_workers: int | dict[int, int] = 0,
 34        custom_transforms: (
 35            Callable
 36            | transforms.Compose
 37            | None
 38            | dict[int, Callable | transforms.Compose | None]
 39        ) = None,
 40        repeat_channels: int | None | dict[int, int | None] = None,
 41        to_tensor: bool | dict[int, bool] = True,
 42        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 43    ) -> None:
 44        r"""
 45        **Args:**
 46        - **root** (`str`): the root directory where the original CUB-200-2011 data 'CUB_200_2011/' lives.
 47        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
 48        - **validation_percentage** (`float`): the fraction (between 0 and 1) of the training data to randomly split off as validation data.
 49        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 50        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 51        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 52        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 53        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 54        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 55        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 56        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 57        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 58        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 59        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 60        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 61        """
 62
 63        super().__init__(
 64            root=root,
 65            class_split=class_split,
 66            batch_size=batch_size,
 67            num_workers=num_workers,
 68            custom_transforms=custom_transforms,
 69            repeat_channels=repeat_channels,
 70            to_tensor=to_tensor,
 71            resize=resize,
 72        )
 73
 74        self.validation_percentage: float = validation_percentage
 75        r"""The fraction (between 0 and 1) of the training data to randomly split off as validation data."""
 76
 77    def prepare_data(self) -> None:
 78        r"""Download the original CUB-200-2011 dataset if it has not been downloaded yet. The download is only performed for the first task (`self.task_id == 1`)."""
 79
 80        if self.task_id != 1:
 81            return  # download all original datasets only at the beginning of first task
 82
 83        CUB2002011(root=self.root_t, train=True, download=True)
 84        CUB2002011(root=self.root_t, train=False, download=True)
 85
 86        pylogger.debug(
 87            "The original CUB-200-2011 dataset has been downloaded to %s.",
 88            self.root_t,
 89        )
 90
 91    def get_subset_of_classes(self, dataset: Dataset) -> Dataset:
 92        r"""Get the subset of the dataset that belongs to the classes of the current task `self.task_id`. The dataset is filtered in place and returned. It is used when constructing the split. It must be implemented by subclasses.
 93
 94        **Args:**
 95        - **dataset** (`Dataset`): the dataset to retrieve subset from.
 96
 97        **Returns:**
 98        - **subset** (`Dataset`): the subset of classes from the dataset.
 99        """
100        classes = self.class_split[self.task_id]
101
102        # get the indices of the dataset that belong to the classes
103        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
104
105        # subset the dataset by the indices, in-place operation
106        dataset.data = dataset.data.iloc[idx]  # data is a Pandas DataFrame
107
108        dataset.target_transform = (
109            self.target_transform()
110        )  # cl class mapping should be applied after the split
111
112        return dataset
113
114    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
115        r"""Get the training and validation dataset of task `self.task_id`.
116
117        **Returns:**
118        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
119        """
120        dataset_train_and_val = self.get_subset_of_classes(
121            CUB2002011(
122                root=self.root_t,
123                train=True,
124                transform=self.train_and_val_transforms(),
125                # cl class mapping should be applied after the split
126                download=False,
127            )
128        )
129
130        return random_split(
131            dataset_train_and_val,
132            lengths=[1 - self.validation_percentage, self.validation_percentage],
133            generator=torch.Generator().manual_seed(
134                42
135            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
136        )
137
138    def test_dataset(self) -> Dataset:
139        r"""Get the test dataset of task `self.task_id`.
140
141        **Returns:**
142        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
143        """
144
145        dataset_test = self.get_subset_of_classes(
146            CUB2002011(
147                root=self.root_t,
148                train=False,
149                transform=self.test_transforms(),
150                # cl class mapping should be applied after the split
151                download=False,
152            )
153        )
154
155        return dataset_test

class SplitCUB2002011(clarena.cl_datasets.base.CLSplitDataset): View Source

 22class SplitCUB2002011(CLSplitDataset):
 23    r"""Split CUB-200-2011 dataset. The [CUB (Caltech-UCSD Birds)-200-2011](https://www.vision.caltech.edu/datasets/cub_200_2011/) is a bird image dataset. It consists of 11,788 color images of 200 bird species (classes), officially split into 5,994 training and 5,794 test images; the images vary in size. The validation set is split off from the training images according to `validation_percentage`."""
 24
 25    original_dataset_python_class: type[Dataset] = CUB2002011
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        class_split: dict[int, list[int]],
 32        validation_percentage: float,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44    ) -> None:
 45        r"""
 46        **Args:**
 47        - **root** (`str`): the root directory where the original CUB-200-2011 data 'CUB_200_2011/' lives.
 48        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
 49        - **validation_percentage** (`float`): the fraction (between 0 and 1) of the training data to randomly split off as validation data.
 50        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 51        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 52        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 53        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 54        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 55        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 56        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 57        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 58        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 59        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 60        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 61        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 62        """
 63
 64        super().__init__(
 65            root=root,
 66            class_split=class_split,
 67            batch_size=batch_size,
 68            num_workers=num_workers,
 69            custom_transforms=custom_transforms,
 70            repeat_channels=repeat_channels,
 71            to_tensor=to_tensor,
 72            resize=resize,
 73        )
 74
 75        self.validation_percentage: float = validation_percentage
 76        r"""The fraction (between 0 and 1) of the training data to randomly split off as validation data."""
 77
 78    def prepare_data(self) -> None:
 79        r"""Download the original CUB-200-2011 dataset if it has not been downloaded yet. The download is only performed for the first task (`self.task_id == 1`)."""
 80
 81        if self.task_id != 1:
 82            return  # download all original datasets only at the beginning of first task
 83
 84        CUB2002011(root=self.root_t, train=True, download=True)
 85        CUB2002011(root=self.root_t, train=False, download=True)
 86
 87        pylogger.debug(
 88            "The original CUB-200-2011 dataset has been downloaded to %s.",
 89            self.root_t,
 90        )
 91
 92    def get_subset_of_classes(self, dataset: Dataset) -> Dataset:
 93        r"""Get the subset of the dataset that belongs to the classes of the current task `self.task_id`. The dataset is filtered in place and returned. It is used when constructing the split. It must be implemented by subclasses.
 94
 95        **Args:**
 96        - **dataset** (`Dataset`): the dataset to retrieve subset from.
 97
 98        **Returns:**
 99        - **subset** (`Dataset`): the subset of classes from the dataset.
100        """
101        classes = self.class_split[self.task_id]
102
103        # get the indices of the dataset that belong to the classes
104        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
105
106        # subset the dataset by the indices, in-place operation
107        dataset.data = dataset.data.iloc[idx]  # data is a Pandas DataFrame
108
109        dataset.target_transform = (
110            self.target_transform()
111        )  # cl class mapping should be applied after the split
112
113        return dataset
114
115    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
116        r"""Get the training and validation dataset of task `self.task_id`.
117
118        **Returns:**
119        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
120        """
121        dataset_train_and_val = self.get_subset_of_classes(
122            CUB2002011(
123                root=self.root_t,
124                train=True,
125                transform=self.train_and_val_transforms(),
126                # cl class mapping should be applied after the split
127                download=False,
128            )
129        )
130
131        return random_split(
132            dataset_train_and_val,
133            lengths=[1 - self.validation_percentage, self.validation_percentage],
134            generator=torch.Generator().manual_seed(
135                42
136            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
137        )
138
139    def test_dataset(self) -> Dataset:
140        r"""Get the test dataset of task `self.task_id`.
141
142        **Returns:**
143        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
144        """
145
146        dataset_test = self.get_subset_of_classes(
147            CUB2002011(
148                root=self.root_t,
149                train=False,
150                transform=self.test_transforms(),
151                # cl class mapping should be applied after the split
152                download=False,
153            )
154        )
155
156        return dataset_test

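Split CUB-200-2011 dataset. The CUB (Caltech-UCSD Birds)-200-2011 is a bird image dataset. It consists of 11,788 color images of 200 bird species (classes), officially split into 5,994 training and 5,794 test images; the images vary in size. The validation set is split off from the training images according to validation_percentage.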

SplitCUB2002011( root: str, class_split: dict[int, list[int]], validation_percentage: float, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None) View Source

28    def __init__(
29        self,
30        root: str,
31        class_split: dict[int, list[int]],
32        validation_percentage: float,
33        batch_size: int | dict[int, int] = 1,
34        num_workers: int | dict[int, int] = 0,
35        custom_transforms: (
36            Callable
37            | transforms.Compose
38            | None
39            | dict[int, Callable | transforms.Compose | None]
40        ) = None,
41        repeat_channels: int | None | dict[int, int | None] = None,
42        to_tensor: bool | dict[int, bool] = True,
43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
44    ) -> None:
45        r"""
46        **Args:**
47        - **root** (`str`): the root directory where the original CUB-200-2011 data 'CUB_200_2011/' lives.
48        - **class_split** (`dict[int, list[int]]`): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
49        - **validation_percentage** (`float`): the fraction (between 0 and 1) of the training data to randomly split off as validation data.
50        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
51        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
52        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
53        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
54        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
55        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
56        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
57        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
58        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
59        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
60        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
61        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
62        """
63
64        super().__init__(
65            root=root,
66            class_split=class_split,
67            batch_size=batch_size,
68            num_workers=num_workers,
69            custom_transforms=custom_transforms,
70            repeat_channels=repeat_channels,
71            to_tensor=to_tensor,
72            resize=resize,
73        )
74
75        self.validation_percentage: float = validation_percentage
76        r"""The fraction (between 0 and 1) of the training data to randomly split off as validation data."""

Args:

root (str): the root directory where the original CUB-200-2011 data 'CUB_200_2011/' lives.
class_split (dict[int, list[int]]): the dict of classes for each task. The keys are task IDs and the values are lists of class labels (integers starting from 0) to split for each task.
validation_percentage (float): the fraction (between 0 and 1) of the training data to randomly split off as validation data.
batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.

original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'clarena.stl_datasets.raw.cub2002011.CUB2002011'>

The original dataset class.

validation_percentage: float

The fraction (between 0 and 1) of the training data to randomly split off as validation data.

def prepare_data(self) -> None: View Source

78    def prepare_data(self) -> None:
79        r"""Download the original CUB-200-2011 dataset if it has not been downloaded yet. The download is only performed for the first task (`self.task_id == 1`)."""
80
81        if self.task_id != 1:
82            return  # download all original datasets only at the beginning of first task
83
84        CUB2002011(root=self.root_t, train=True, download=True)
85        CUB2002011(root=self.root_t, train=False, download=True)
86
87        pylogger.debug(
88            "The original CUB-200-2011 dataset has been downloaded to %s.",
89            self.root_t,
90        )

Download the original CUB-200-2011 dataset if it has not been downloaded yet. The download is only performed for the first task (self.task_id == 1).

def get_subset_of_classes( self, dataset: torch.utils.data.dataset.Dataset) -> torch.utils.data.dataset.Dataset: View Source

 92    def get_subset_of_classes(self, dataset: Dataset) -> Dataset:
 93        r"""Get the subset of the dataset that belongs to the classes of the current task `self.task_id`. The dataset is filtered in place and returned. It is used when constructing the split. It must be implemented by subclasses.
 94
 95        **Args:**
 96        - **dataset** (`Dataset`): the dataset to retrieve subset from.
 97
 98        **Returns:**
 99        - **subset** (`Dataset`): the subset of classes from the dataset.
100        """
101        classes = self.class_split[self.task_id]
102
103        # get the indices of the dataset that belong to the classes
104        idx = [i for i, (_, target) in enumerate(dataset) if target in classes]
105
106        # subset the dataset by the indices, in-place operation
107        dataset.data = dataset.data.iloc[idx]  # data is a Pandas DataFrame
108
109        dataset.target_transform = (
110            self.target_transform()
111        )  # cl class mapping should be applied after the split
112
113        return dataset

Get the subset of the dataset that belongs to the classes of the current task self.task_id. The dataset is filtered in place and returned. It is used when constructing the split. It must be implemented by subclasses.

Args:

dataset (Dataset): the dataset to retrieve subset from.

Returns:

subset (Dataset): the subset of classes from the dataset.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]: View Source

115    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
116        r"""Get the training and validation dataset of task `self.task_id`.
117
118        **Returns:**
119        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
120        """
121        dataset_train_and_val = self.get_subset_of_classes(
122            CUB2002011(
123                root=self.root_t,
124                train=True,
125                transform=self.train_and_val_transforms(),
126                # cl class mapping should be applied after the split
127                download=False,
128            )
129        )
130
131        return random_split(
132            dataset_train_and_val,
133            lengths=[1 - self.validation_percentage, self.validation_percentage],
134            generator=torch.Generator().manual_seed(
135                42
136            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
137        )

Get the training and validation dataset of task self.task_id.

Returns:

train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.

def test_dataset(self) -> torch.utils.data.dataset.Dataset: View Source

139    def test_dataset(self) -> Dataset:
140        r"""Get the test dataset of task `self.task_id`.
141
142        **Returns:**
143        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
144        """
145
146        dataset_test = self.get_subset_of_classes(
147            CUB2002011(
148                root=self.root_t,
149                train=False,
150                transform=self.test_transforms(),
151                # cl class mapping should be applied after the split
152                download=False,
153            )
154        )
155
156        return dataset_test

Get the test dataset of task self.task_id.

Returns:

test_dataset (Dataset): the test dataset of task self.task_id.