clarena.cl_datasets.permuted_celeba
The submodule in cl_datasets for Permuted CelebA dataset.
r"""
The submodule in `cl_datasets` for Permuted CelebA dataset.
"""

__all__ = ["PermutedCelebA"]

import logging
from typing import Callable

from torch.utils.data import Dataset
from torchvision.datasets import CelebA
from torchvision.transforms import transforms

from clarena.cl_datasets import CLPermutedDataset

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class PermutedCelebA(CLPermutedDataset):
    r"""Permuted CelebA dataset. The [CelebFaces Attributes Dataset (CelebA)](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.

    Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.
    """

    original_dataset_python_class: type[Dataset] = CelebA
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        - **resize** (`tuple[int, int]` | `None` | dict of them): the size to resize the images to. Default is `None`, which means no resize.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """
        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

    def prepare_data(self) -> None:
        r"""Download the original CelebA dataset if it hasn't been downloaded yet."""
        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        CelebA(root=self.root_t, split="train", target_type="identity", download=True)
        CelebA(root=self.root_t, split="valid", target_type="identity", download=True)
        CelebA(root=self.root_t, split="test", target_type="identity", download=True)

        pylogger.debug(
            "The original CelebA dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        # FIX: docstring made a raw string (r"""), consistent with every other
        # docstring in this module.
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train = CelebA(
            root=self.root_t,
            split="train",
            target_type="identity",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        dataset_val = CelebA(
            root=self.root_t,
            split="valid",
            target_type="identity",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        dataset_test = CelebA(
            root=self.root_t,
            split="test",
            target_type="identity",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
class PermutedCelebA(CLPermutedDataset):
    r"""Permuted CelebA dataset. The [CelebFaces Attributes Dataset (CelebA)](https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.

    Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.
    """

    original_dataset_python_class: type[Dataset] = CelebA
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset; valid task IDs run from 1 to `num_tasks`.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders; a dict maps task IDs to per-task values, an `int` applies to all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of dataloader workers; a dict maps task IDs to per-task values, an `int` applies to all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): custom transforms applied ONLY to the TRAIN dataset (excluding `ToTensor()`, normalization, permute, etc.); a dict maps task IDs to per-task transforms, `None` applies no custom transforms.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat per task; `None` (default) means no repeat.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        - **resize** (`tuple[int, int]` | `None` | dict of them): the target image size; `None` (default) means no resize.
        - **permutation_mode** (`str`): the mode of permutation; one of 'all' (permute all pixels), 'by_channel' (permute channel by channel separately, same order for all channels), or 'first_channel_only' (permute only the first channel).
        - **permutation_seeds** (`dict[int, int]` | `None`): seeds for the permutation used to construct each task, keyed by task ID. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """
        # This subclass only selects CelebA as the underlying dataset; all
        # permuted-CL mechanics live in the CLPermutedDataset base class.
        base_kwargs = {
            "root": root,
            "num_tasks": num_tasks,
            "batch_size": batch_size,
            "num_workers": num_workers,
            "custom_transforms": custom_transforms,
            "repeat_channels": repeat_channels,
            "to_tensor": to_tensor,
            "resize": resize,
            "permutation_mode": permutation_mode,
            "permutation_seeds": permutation_seeds,
        }
        super().__init__(**base_kwargs)

    def prepare_data(self) -> None:
        r"""Download the original CelebA dataset if it hasn't been downloaded yet."""
        # All splits are fetched once, at the beginning of the first task;
        # subsequent tasks reuse the already-downloaded data.
        if self.task_id != 1:
            return

        for split_name in ("train", "valid", "test"):
            CelebA(
                root=self.root_t,
                split=split_name,
                target_type="identity",
                download=True,
            )

        pylogger.debug(
            "The original CelebA dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """

        def _make_split(split_name: str) -> Dataset:
            # Both splits share the same task-specific transform pipeline.
            return CelebA(
                root=self.root_t,
                split=split_name,
                target_type="identity",
                transform=self.train_and_val_transforms(),
                target_transform=self.target_transform(),
                download=False,
            )

        return _make_split("train"), _make_split("valid")

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        return CelebA(
            root=self.root_t,
            split="test",
            target_type="identity",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )
Permuted CelebA dataset. The CelebFaces Attributes Dataset (CelebA) is a large-scale celebrity faces dataset. It consists of 202,599 face images of 10,177 celebrity identities (classes), each 178x218 color image.
Note that the original CelebA dataset is not a classification dataset but an attributes dataset. We only use the identity of each face as the class label for classification.
PermutedCelebA( root: str, num_tasks: int, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
def __init__(
    self,
    root: str,
    num_tasks: int,
    batch_size: int | dict[int, int] = 1,
    num_workers: int | dict[int, int] = 0,
    custom_transforms: (
        Callable
        | transforms.Compose
        | None
        | dict[int, Callable | transforms.Compose | None]
    ) = None,
    repeat_channels: int | None | dict[int, int | None] = None,
    to_tensor: bool | dict[int, bool] = True,
    resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
    permutation_mode: str = "first_channel_only",
    permutation_seeds: dict[int, int] | None = None,
) -> None:
    r"""
    **Args:**
    - **root** (`str`): the root directory where the original CelebA data 'CelebA/' live.
    - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset; valid task IDs run from 1 to `num_tasks`.
    - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders; a dict maps task IDs to per-task values, an `int` applies to all tasks.
    - **num_workers** (`int` | `dict[int, int]`): the number of dataloader workers; a dict maps task IDs to per-task values, an `int` applies to all tasks.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): custom transforms applied ONLY to the TRAIN dataset (excluding `ToTensor()`, normalization, permute, etc.); a dict maps task IDs to per-task transforms, `None` applies no custom transforms.
    - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat per task; `None` (default) means no repeat.
    - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
    - **resize** (`tuple[int, int]` | `None` | dict of them): the target image size; `None` (default) means no resize.
    - **permutation_mode** (`str`): the mode of permutation; one of 'all' (permute all pixels), 'by_channel' (permute channel by channel separately, same order for all channels), or 'first_channel_only' (permute only the first channel).
    - **permutation_seeds** (`dict[int, int]` | `None`): seeds for the permutation used to construct each task, keyed by task ID. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
    """
    # Pure pass-through: all permuted-CL behavior is implemented by the
    # CLPermutedDataset base class.
    base_kwargs = {
        "root": root,
        "num_tasks": num_tasks,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "custom_transforms": custom_transforms,
        "repeat_channels": repeat_channels,
        "to_tensor": to_tensor,
        "resize": resize,
        "permutation_mode": permutation_mode,
        "permutation_seeds": permutation_seeds,
    }
    super().__init__(**base_kwargs)
Args:
- root (
`str`): the root directory where the original CelebA data 'CelebA/' live. - num_tasks (
`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`. - batch_size (
`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks. - num_workers (
`int` | `dict[int, int]`): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks. - custom_transforms (
`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied. - repeat_channels (
`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied. - to_tensor (
`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`. If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks. - resize (
`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied. - permutation_mode (
`str`): the mode of permutation; one of: - 'all': permute all pixels.
- 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
- 'first_channel_only': permute only the first channel.
- permutation_seeds (
`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'torchvision.datasets.celeba.CelebA'>
The original dataset class.
def
prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Download the original CelebA dataset if it hasn't been downloaded yet."""
    # All three splits are fetched once, when the first task is prepared;
    # later tasks reuse the already-downloaded data.
    if self.task_id != 1:
        return

    for split_name in ("train", "valid", "test"):
        CelebA(
            root=self.root_t,
            split=split_name,
            target_type="identity",
            download=True,
        )

    pylogger.debug(
        "The original CelebA dataset has been downloaded to %s.", self.root_t
    )
Download the original CelebA dataset if it hasn't been downloaded yet.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset of task `self.task_id`.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
    """

    def _make_split(split_name: str) -> Dataset:
        # Both splits use the same task-specific transform pipeline; the
        # targets are celebrity identities used as class labels.
        return CelebA(
            root=self.root_t,
            split=split_name,
            target_type="identity",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

    return _make_split("train"), _make_split("valid")
Get the training and validation dataset of task self.task_id.
Returns:
- train_and_val_dataset (
`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
def test_dataset(self) -> Dataset:
    r"""Get the test dataset of task `self.task_id`.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
    """
    # Uses test-time transforms only; train-only custom transforms are not
    # part of self.test_transforms().
    return CelebA(
        root=self.root_t,
        split="test",
        target_type="identity",
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )
Get the test dataset of task self.task_id.
Returns:
- test_dataset (
`Dataset`): the test dataset of task `self.task_id`.