clarena.cl_datasets.permuted_celeba

The submodule in `cl_datasets` for the Permuted CelebA dataset.

  1r"""
  2The submodule in `cl_datasets` for Permuted CelebA dataset.
  3"""
  4
  5__all__ = ["PermutedCelebA"]
  6
  7import logging
  8from typing import Callable
  9
 10from torch.utils.data import Dataset
 11from torchvision.datasets import CelebA
 12from torchvision.transforms import transforms
 13
 14from clarena.cl_datasets import CLPermutedDataset
 15
 16# always get logger for built-in logging in each module
 17pylogger = logging.getLogger(__name__)
 18
 19
 20class PermutedCelebA(CLPermutedDataset):
 21    r"""Permuted CelebA dataset. The [CelebFaces Attributes Dataset (CelebA)](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.
 22
 23    Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.
 24    """
 25
 26    original_dataset_python_class: type[Dataset] = CelebA
 27    r"""The original dataset class."""
 28
 29    def __init__(
 30        self,
 31        root: str,
 32        num_tasks: int,
 33        batch_size: int | dict[int, int] = 1,
 34        num_workers: int | dict[int, int] = 0,
 35        custom_transforms: (
 36            Callable
 37            | transforms.Compose
 38            | None
 39            | dict[int, Callable | transforms.Compose | None]
 40        ) = None,
 41        repeat_channels: int | None | dict[int, int | None] = None,
 42        to_tensor: bool | dict[int, bool] = True,
 43        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 44        permutation_mode: str = "first_channel_only",
 45        permutation_seeds: dict[int, int] | None = None,
 46    ) -> None:
 47        r"""
 48        **Args:**
 49        - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
 50        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 51        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 52        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 53        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 54        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 55        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 56        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 57        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 58        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 59        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 60        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 61        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 62        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 63        - **permutation_mode** (`str`): the mode of permutation; one of:
 64            1. 'all': permute all pixels.
 65            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 66            3. 'first_channel_only': permute only the first channel.
 67        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 68        """
 69        super().__init__(
 70            root=root,
 71            num_tasks=num_tasks,
 72            batch_size=batch_size,
 73            num_workers=num_workers,
 74            custom_transforms=custom_transforms,
 75            repeat_channels=repeat_channels,
 76            to_tensor=to_tensor,
 77            resize=resize,
 78            permutation_mode=permutation_mode,
 79            permutation_seeds=permutation_seeds,
 80        )
 81
 82    def prepare_data(self) -> None:
 83        r"""Download the original CelebA dataset if haven't."""
 84        if self.task_id != 1:
 85            return  # download all original datasets only at the beginning of first task
 86
 87        CelebA(root=self.root_t, split="train", target_type="identity", download=True)
 88        CelebA(root=self.root_t, split="valid", target_type="identity", download=True)
 89        CelebA(root=self.root_t, split="test", target_type="identity", download=True)
 90
 91        pylogger.debug(
 92            "The original CelebA dataset has been downloaded to %s.", self.root_t
 93        )
 94
 95    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 96        """Get the training and validation dataset of task `self.task_id`.
 97
 98        **Returns:**
 99        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
100        """
101        dataset_train = CelebA(
102            root=self.root_t,
103            split="train",
104            target_type="identity",
105            transform=self.train_and_val_transforms(),
106            target_transform=self.target_transform(),
107            download=False,
108        )
109
110        dataset_val = CelebA(
111            root=self.root_t,
112            split="valid",
113            target_type="identity",
114            transform=self.train_and_val_transforms(),
115            target_transform=self.target_transform(),
116            download=False,
117        )
118
119        return dataset_train, dataset_val
120
121    def test_dataset(self) -> Dataset:
122        r"""Get the test dataset of task `self.task_id`.
123
124        **Returns:**
125        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
126        """
127        dataset_test = CelebA(
128            root=self.root_t,
129            split="test",
130            target_type="identity",
131            transform=self.test_transforms(),
132            target_transform=self.target_transform(),
133            download=False,
134        )
135
136        return dataset_test
class PermutedCelebA(CLPermutedDataset):
    r"""Permuted CelebA dataset. The [CelebFaces Attributes Dataset (CelebA)](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.

    Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.
    """

    original_dataset_python_class: type[Dataset] = CelebA
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """
        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

    def prepare_data(self) -> None:
        r"""Download the original CelebA dataset if it hasn't been downloaded yet."""
        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        # all three splits are fetched up front so later tasks can skip the download
        CelebA(root=self.root_t, split="train", target_type="identity", download=True)
        CelebA(root=self.root_t, split="valid", target_type="identity", download=True)
        CelebA(root=self.root_t, split="test", target_type="identity", download=True)

        pylogger.debug(
            "The original CelebA dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train = CelebA(
            root=self.root_t,
            split="train",
            target_type="identity",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        dataset_val = CelebA(
            root=self.root_t,
            split="valid",
            target_type="identity",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        dataset_test = CelebA(
            root=self.root_t,
            split="test",
            target_type="identity",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test

Permuted CelebA dataset. The CelebFaces Attributes Dataset (CelebA) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.

Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.

PermutedCelebA( root: str, num_tasks: int, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
30    def __init__(
31        self,
32        root: str,
33        num_tasks: int,
34        batch_size: int | dict[int, int] = 1,
35        num_workers: int | dict[int, int] = 0,
36        custom_transforms: (
37            Callable
38            | transforms.Compose
39            | None
40            | dict[int, Callable | transforms.Compose | None]
41        ) = None,
42        repeat_channels: int | None | dict[int, int | None] = None,
43        to_tensor: bool | dict[int, bool] = True,
44        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
45        permutation_mode: str = "first_channel_only",
46        permutation_seeds: dict[int, int] | None = None,
47    ) -> None:
48        r"""
49        **Args:**
50        - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
51        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
52        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
53        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
54        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
55        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
56        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
57        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
58        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
59        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
60        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
61        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
62        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
63        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
64        - **permutation_mode** (`str`): the mode of permutation; one of:
65            1. 'all': permute all pixels.
66            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
67            3. 'first_channel_only': permute only the first channel.
68        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
69        """
70        super().__init__(
71            root=root,
72            num_tasks=num_tasks,
73            batch_size=batch_size,
74            num_workers=num_workers,
75            custom_transforms=custom_transforms,
76            repeat_channels=repeat_channels,
77            to_tensor=to_tensor,
78            resize=resize,
79            permutation_mode=permutation_mode,
80            permutation_seeds=permutation_seeds,
81        )

Args:

  • root (str): the root directory where the original CelebA data 'CelebA/' live.
  • num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
  • batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
  • num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
  • custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
  • repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
  • to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
  • resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
  • permutation_mode (str): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
    3. 'first_channel_only': permute only the first channel.
  • permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.celeba.CelebA'>

The original dataset class.

def prepare_data(self) -> None:
83    def prepare_data(self) -> None:
84        r"""Download the original CelebA dataset if haven't."""
85        if self.task_id != 1:
86            return  # download all original datasets only at the beginning of first task
87
88        CelebA(root=self.root_t, split="train", target_type="identity", download=True)
89        CelebA(root=self.root_t, split="valid", target_type="identity", download=True)
90        CelebA(root=self.root_t, split="test", target_type="identity", download=True)
91
92        pylogger.debug(
93            "The original CelebA dataset has been downloaded to %s.", self.root_t
94        )

Download the original CelebA dataset if it hasn't been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
 96    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 97        """Get the training and validation dataset of task `self.task_id`.
 98
 99        **Returns:**
100        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
101        """
102        dataset_train = CelebA(
103            root=self.root_t,
104            split="train",
105            target_type="identity",
106            transform=self.train_and_val_transforms(),
107            target_transform=self.target_transform(),
108            download=False,
109        )
110
111        dataset_val = CelebA(
112            root=self.root_t,
113            split="valid",
114            target_type="identity",
115            transform=self.train_and_val_transforms(),
116            target_transform=self.target_transform(),
117            download=False,
118        )
119
120        return dataset_train, dataset_val

Get the training and validation dataset of task self.task_id.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
122    def test_dataset(self) -> Dataset:
123        r"""Get the test dataset of task `self.task_id`.
124
125        **Returns:**
126        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
127        """
128        dataset_test = CelebA(
129            root=self.root_t,
130            split="test",
131            target_type="identity",
132            transform=self.test_transforms(),
133            target_transform=self.target_transform(),
134            download=False,
135        )
136
137        return dataset_test

Get the test dataset of task self.task_id.

Returns:

  • test_dataset (Dataset): the test dataset of task self.task_id.