clarena.cl_datasets.permuted_pcam

The submodule in `cl_datasets` for the Permuted PCAM dataset.

  1r"""
  2The submodule in `cl_datasets` for Permuted PCAM dataset.
  3"""
  4
  5__all__ = ["PermutedPCAM"]
  6
  7import logging
  8from typing import Callable
  9
 10from torch.utils.data import Dataset
 11from torchvision.datasets import PCAM
 12from torchvision.transforms import transforms
 13
 14from clarena.cl_datasets import CLPermutedDataset
 15
 16# always get logger for built-in logging in each module
 17pylogger = logging.getLogger(__name__)
 18
 19
 20class PermutedPCAM(CLPermutedDataset):
 21    r"""Permuted PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each 96x96 color image."""
 22
 23    original_dataset_python_class: type[Dataset] = PCAM
 24    r"""The original dataset class."""
 25
 26    def __init__(
 27        self,
 28        root: str,
 29        num_tasks: int,
 30        batch_size: int | dict[int, int] = 1,
 31        num_workers: int | dict[int, int] = 0,
 32        custom_transforms: (
 33            Callable
 34            | transforms.Compose
 35            | None
 36            | dict[int, Callable | transforms.Compose | None]
 37        ) = None,
 38        repeat_channels: int | None | dict[int, int | None] = None,
 39        to_tensor: bool | dict[int, bool] = True,
 40        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
 41        permutation_mode: str = "first_channel_only",
 42        permutation_seeds: dict[int, int] | None = None,
 43    ) -> None:
 44        r"""
 45        **Args:**
 46        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' live.
 47        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
 48        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
 49        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
 50        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
 51        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
 52        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
 53        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
 54        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
 55        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
 56        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
 57        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
 58        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
 59        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
 60        - **permutation_mode** (`str`): the mode of permutation; one of:
 61            1. 'all': permute all pixels.
 62            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
 63            3. 'first_channel_only': permute only the first channel.
 64        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
 65        """
 66
 67        super().__init__(
 68            root=root,
 69            num_tasks=num_tasks,
 70            batch_size=batch_size,
 71            num_workers=num_workers,
 72            custom_transforms=custom_transforms,
 73            repeat_channels=repeat_channels,
 74            to_tensor=to_tensor,
 75            resize=resize,
 76            permutation_mode=permutation_mode,
 77            permutation_seeds=permutation_seeds,
 78        )
 79
 80    def prepare_data(self) -> None:
 81        r"""Download the original PCAM dataset if haven't."""
 82
 83        if self.task_id != 1:
 84            return  # download all original datasets only at the beginning of first task
 85
 86        PCAM(root=self.root_t, split="train", download=True)
 87        PCAM(root=self.root_t, split="val", download=True)
 88        PCAM(root=self.root_t, split="test", download=True)
 89
 90        pylogger.debug(
 91            "The original PCAM dataset has been downloaded to %s.", self.root_t
 92        )
 93
 94    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 95        """Get the training and validation dataset of task `self.task_id`.
 96
 97        **Returns:**
 98        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
 99        """
100        dataset_train = PCAM(
101            root=self.root_t,
102            split="train",
103            transform=self.train_and_val_transforms(),
104            target_transform=self.target_transform(),
105            download=False,
106        )
107
108        dataset_val = PCAM(
109            root=self.root_t,
110            split="val",
111            transform=self.train_and_val_transforms(),
112            download=False,
113        )
114
115        return dataset_train, dataset_val
116
117    def test_dataset(self) -> Dataset:
118        r"""Get the test dataset of task `self.task_id`.
119
120        **Returns:**
121        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
122        """
123        dataset_test = PCAM(
124            root=self.root_t,
125            split="test",
126            transform=self.test_transforms(),
127            target_transform=self.target_transform(),
128            download=False,
129        )
130
131        return dataset_test
class PermutedPCAM(CLPermutedDataset):
    r"""Permuted PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each 96x96 color image."""

    original_dataset_python_class: type[Dataset] = PCAM
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        num_tasks: int,
        batch_size: int | dict[int, int] = 1,
        num_workers: int | dict[int, int] = 0,
        custom_transforms: (
            Callable
            | transforms.Compose
            | None
            | dict[int, Callable | transforms.Compose | None]
        ) = None,
        repeat_channels: int | None | dict[int, int | None] = None,
        to_tensor: bool | dict[int, bool] = True,
        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
        permutation_mode: str = "first_channel_only",
        permutation_seeds: dict[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' live.
        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
        - **permutation_mode** (`str`): the mode of permutation; one of:
            1. 'all': permute all pixels.
            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
            3. 'first_channel_only': permute only the first channel.
        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
        """

        super().__init__(
            root=root,
            num_tasks=num_tasks,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
            permutation_mode=permutation_mode,
            permutation_seeds=permutation_seeds,
        )

    def prepare_data(self) -> None:
        r"""Download the original PCAM dataset if it hasn't been downloaded yet."""

        if self.task_id != 1:
            return  # download all original datasets only at the beginning of first task

        PCAM(root=self.root_t, split="train", download=True)
        PCAM(root=self.root_t, split="val", download=True)
        PCAM(root=self.root_t, split="test", download=True)

        pylogger.debug(
            "The original PCAM dataset has been downloaded to %s.", self.root_t
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset of task `self.task_id`.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
        """
        dataset_train = PCAM(
            root=self.root_t,
            split="train",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # FIX: target_transform was previously missing for the validation split,
        # so validation labels were not transformed consistently with the train
        # and test splits. Apply the same target transform to all three splits.
        dataset_val = PCAM(
            root=self.root_t,
            split="val",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset of task `self.task_id`.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
        """
        dataset_test = PCAM(
            root=self.root_t,
            split="test",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test

Permuted PCAM dataset. The PCAM dataset is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each 96x96 color image.

PermutedPCAM( root: str, num_tasks: int, batch_size: int | dict[int, int] = 1, num_workers: int | dict[int, int] = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType, dict[int, Union[Callable, torchvision.transforms.transforms.Compose, NoneType]]] = None, repeat_channels: int | None | dict[int, int | None] = None, to_tensor: bool | dict[int, bool] = True, resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None, permutation_mode: str = 'first_channel_only', permutation_seeds: dict[int, int] | None = None)
27    def __init__(
28        self,
29        root: str,
30        num_tasks: int,
31        batch_size: int | dict[int, int] = 1,
32        num_workers: int | dict[int, int] = 0,
33        custom_transforms: (
34            Callable
35            | transforms.Compose
36            | None
37            | dict[int, Callable | transforms.Compose | None]
38        ) = None,
39        repeat_channels: int | None | dict[int, int | None] = None,
40        to_tensor: bool | dict[int, bool] = True,
41        resize: tuple[int, int] | None | dict[int, tuple[int, int] | None] = None,
42        permutation_mode: str = "first_channel_only",
43        permutation_seeds: dict[int, int] | None = None,
44    ) -> None:
45        r"""
46        **Args:**
47        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' live.
48        - **num_tasks** (`int`): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to `num_tasks`.
49        - **batch_size** (`int` | `dict[int, int]`): the batch size for train, val, and test dataloaders.
50        If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an `int`, it is the same batch size for all tasks.
51        - **num_workers** (`int` | `dict[int, int]`): the number of workers for dataloaders.
52        If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an `int`, it is the same number of workers for all tasks.
53        - **custom_transforms** (`transform` or `transforms.Compose` or `None` or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. `ToTensor()`, normalization, permute, and so on are not included.
54        If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is `None`, no custom transforms are applied.
55        - **repeat_channels** (`int` | `None` | dict of them): the number of channels to repeat for each task. Default is `None`, which means no repeat.
56        If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an `int`, it is the same number of channels to repeat for all tasks. If it is `None`, no repeat is applied.
57        - **to_tensor** (`bool` | `dict[int, bool]`): whether to include the `ToTensor()` transform. Default is `True`.
58        If it is a dict, the keys are task IDs and the values are whether to include the `ToTensor()` transform for each task. If it is a single boolean value, it is applied to all tasks.
59        - **resize** (`tuple[int, int]` | `None` or dict of them): the size to resize the images to. Default is `None`, which means no resize.
60        If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is `None`, no resize is applied.
61        - **permutation_mode** (`str`): the mode of permutation; one of:
62            1. 'all': permute all pixels.
63            2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
64            3. 'first_channel_only': permute only the first channel.
65        - **permutation_seeds** (`dict[int, int]` | `None`): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is `None`, which creates a dict of seeds from 0 to `num_tasks`-1.
66        """
67
68        super().__init__(
69            root=root,
70            num_tasks=num_tasks,
71            batch_size=batch_size,
72            num_workers=num_workers,
73            custom_transforms=custom_transforms,
74            repeat_channels=repeat_channels,
75            to_tensor=to_tensor,
76            resize=resize,
77            permutation_mode=permutation_mode,
78            permutation_seeds=permutation_seeds,
79        )

Args:

  • root (str): the root directory where the original PCAM data 'PCAM/' live.
  • num_tasks (int): the maximum number of tasks supported by the CL dataset. This decides the valid task IDs from 1 to num_tasks.
  • batch_size (int | dict[int, int]): the batch size for train, val, and test dataloaders. If it is a dict, the keys are task IDs and the values are the batch sizes for each task. If it is an int, it is the same batch size for all tasks.
  • num_workers (int | dict[int, int]): the number of workers for dataloaders. If it is a dict, the keys are task IDs and the values are the number of workers for each task. If it is an int, it is the same number of workers for all tasks.
  • custom_transforms (transform or transforms.Compose or None or dict of them): the custom transforms to apply ONLY to the TRAIN dataset. Can be a single transform, composed transforms, or no transform. ToTensor(), normalization, permute, and so on are not included. If it is a dict, the keys are task IDs and the values are the custom transforms for each task. If it is a single transform or composed transforms, it is applied to all tasks. If it is None, no custom transforms are applied.
  • repeat_channels (int | None | dict of them): the number of channels to repeat for each task. Default is None, which means no repeat. If it is a dict, the keys are task IDs and the values are the number of channels to repeat for each task. If it is an int, it is the same number of channels to repeat for all tasks. If it is None, no repeat is applied.
  • to_tensor (bool | dict[int, bool]): whether to include the ToTensor() transform. Default is True. If it is a dict, the keys are task IDs and the values are whether to include the ToTensor() transform for each task. If it is a single boolean value, it is applied to all tasks.
  • resize (tuple[int, int] | None or dict of them): the size to resize the images to. Default is None, which means no resize. If it is a dict, the keys are task IDs and the values are the sizes to resize for each task. If it is a single tuple of two integers, it is applied to all tasks. If it is None, no resize is applied.
  • permutation_mode (str): the mode of permutation; one of:
    1. 'all': permute all pixels.
    2. 'by_channel': permute channel by channel separately. All channels are applied the same permutation order.
    3. 'first_channel_only': permute only the first channel.
  • permutation_seeds (dict[int, int] | None): the dict of seeds for permutation operations used to construct each task. Keys are task IDs and the values are permutation seeds for each task. Default is None, which creates a dict of seeds from 0 to num_tasks-1.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.pcam.PCAM'>

The original dataset class.

def prepare_data(self) -> None:
81    def prepare_data(self) -> None:
82        r"""Download the original PCAM dataset if haven't."""
83
84        if self.task_id != 1:
85            return  # download all original datasets only at the beginning of first task
86
87        PCAM(root=self.root_t, split="train", download=True)
88        PCAM(root=self.root_t, split="val", download=True)
89        PCAM(root=self.root_t, split="test", download=True)
90
91        pylogger.debug(
92            "The original PCAM dataset has been downloaded to %s.", self.root_t
93        )

Download the original PCAM dataset if it hasn't been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
 95    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 96        """Get the training and validation dataset of task `self.task_id`.
 97
 98        **Returns:**
 99        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset of task `self.task_id`.
100        """
101        dataset_train = PCAM(
102            root=self.root_t,
103            split="train",
104            transform=self.train_and_val_transforms(),
105            target_transform=self.target_transform(),
106            download=False,
107        )
108
109        dataset_val = PCAM(
110            root=self.root_t,
111            split="val",
112            transform=self.train_and_val_transforms(),
113            download=False,
114        )
115
116        return dataset_train, dataset_val

Get the training and validation dataset of task self.task_id.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset of task self.task_id.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
118    def test_dataset(self) -> Dataset:
119        r"""Get the test dataset of task `self.task_id`.
120
121        **Returns:**
122        - **test_dataset** (`Dataset`): the test dataset of task `self.task_id`.
123        """
124        dataset_test = PCAM(
125            root=self.root_t,
126            split="test",
127            transform=self.test_transforms(),
128            target_transform=self.target_transform(),
129            download=False,
130        )
131
132        return dataset_test

Get the test dataset of task self.task_id.

Returns:

  • test_dataset (Dataset): the test dataset of task self.task_id.