clarena.stl_datasets

Single-Task Learning Datasets

This submodule provides the single-task learning datasets that can be used in CLArena.

Here are the base classes for single-task learning datasets, which inherit from Lightning LightningDataModule:

  • STLDataset: The base class for all single-task learning datasets.

Please note that this is an API documentation. Please refer to the main documentation pages for more information about how to configure and implement single-task learning datasets:

r"""

# Single-Task Learning Datasets

This submodule provides the **single-task learning datasets** that can be used in CLArena.

Here are the base classes for single-task learning datasets, which inherit from Lightning `LightningDataModule`:

- `STLDataset`: The base class for all single-task learning datasets.
    - `STLDatasetFromRaw`: The base class for constructing single-task learning datasets from raw datasets. A child class of `STLDataset`.
- `TaskLabelledDataset`: a dataset class exported from `.base` alongside the base classes.

Please note that this is an API documentation. Please refer to the main documentation pages for more information about how to configure and implement single-task learning datasets:

- [**Configure STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/components/STL-dataset)
- [**Implement Custom STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/custom-implementation/STL-dataset)

"""

from .base import STLDataset, STLDatasetFromRaw, TaskLabelledDataset

from .ahdd import ArabicHandwrittenDigits
from .caltech101 import Caltech101
from .caltech256 import Caltech256
from .celeba import CelebA
from .cifar10 import CIFAR10
from .cifar100 import CIFAR100
from .country211 import Country211
from .cub2002011 import CUB2002011
from .dtd import DTD
from .facescrub import FaceScrub
from .fashionmnist import FashionMNIST
from .fer2013 import FER2013
from .fgvc_aircraft import FGVCAircraft
from .flowers102 import Flowers102
from .food101 import Food101
from .emnist import EMNIST
from .eurosat import EuroSAT
from .gtsrb import GTSRB
from .imagenette import Imagenette
from .kannadamnist import KannadaMNIST
from .kmnist import KMNIST
from .linnaeus5 import Linnaeus5
from .mnist import MNIST
from .notmnist import NotMNIST
from .oxford_iiit_pet import OxfordIIITPet
from .pcam import PCAM
from .renderedsst2 import RenderedSST2
from .SEMEION import SEMEION
from .sign_language_mnist import SignLanguageMNIST
from .stanfordcars import StanfordCars
from .sun397 import SUN397
from .svhn import SVHN
from .tinyimagenet import TinyImageNet
from .usps import USPS


__all__ = [
    "STLDataset",
    "STLDatasetFromRaw",
    "TaskLabelledDataset",
    "ahdd",
    "caltech101",
    "caltech256",
    "celeba",
    "cifar10",
    "cifar100",
    "country211",
    "cub2002011",
    "dtd",
    "facescrub",
    "fashionmnist",
    "fer2013",
    "fgvc_aircraft",
    "flowers102",
    "food101",
    "emnist",
    "eurosat",
    "gtsrb",
    "imagenette",
    "kannadamnist",
    "kmnist",
    "linnaeus5",
    "mnist",
    "notmnist",
    "oxford_iiit_pet",
    "pcam",
    "renderedsst2",
    "SEMEION",
    "sign_language_mnist",
    "stanfordcars",
    "sun397",
    "svhn",
    "tinyimagenet",
    "usps",
]
class STLDataset(lightning.pytorch.core.datamodule.LightningDataModule):
 31class STLDataset(LightningDataModule):
 32    r"""The base class of single-task learning datasets."""
 33
 34    def __init__(
 35        self,
 36        root: str,
 37        batch_size: int = 1,
 38        num_workers: int = 0,
 39        custom_transforms: Callable | transforms.Compose | None = None,
 40        repeat_channels: int | None = None,
 41        to_tensor: bool = True,
 42        resize: tuple[int, int] | None = None,
 43    ) -> None:
 44        r"""
 45        **Args:**
 46        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
 47        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 48        - **num_workers** (`int`): the number of workers for dataloaders.
 49        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 50        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 51        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 52        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 53        """
 54        super().__init__()
 55
 56        self.root: str = root
 57        r"""The root directory of the original data files."""
 58        self.batch_size: int = batch_size
 59        r"""The batch size for dataloaders."""
 60        self.num_workers: int = num_workers
 61        r"""The number of workers for dataloaders."""
 62
 63        self.custom_transforms: Callable | transforms.Compose = custom_transforms
 64        r"""The custom transforms."""
 65        self.repeat_channels: int | None = repeat_channels
 66        r"""The number of channels to repeat."""
 67        self.to_tensor: bool = to_tensor
 68        r"""The to_tensor flag."""
 69        self.resize: tuple[int, int] | None = resize
 70        r"""The size to resize."""
 71
 72        self.dataset_train: Any
 73        r"""Training dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
 74        self.dataset_val: Any
 75        r"""Validation dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
 76        self.dataset_test: Any
 77        r"""Test dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
 78        self.mean: float
 79        r"""Mean value for normalization."""
 80        self.std: float
 81        r"""Standard deviation value for normalization."""
 82
 83        STLDataset.sanity_check(self)
 84
 85    def sanity_check(self) -> None:
 86        r"""Sanity check."""
 87
 88    @abstractmethod
 89    def get_class_map(self) -> dict[str | int, int]:
 90        r"""Get the mapping of classes. It must be implemented by subclasses.
 91
 92        **Returns:**
 93        - **class_map**(`dict[str | int, int]`): the class map. Keys are original class labels and values are integer class labels for single-task learning.
 94        """
 95
 96    @abstractmethod
 97    def prepare_data(self) -> None:
 98        r"""Use this to download and prepare data. It must be implemented by subclasses, as required by `LightningDatamodule`."""
 99
100    def setup(self, stage: str) -> None:
101        r"""Set up the dataset for different stages.
102
103        **Args:**
104        - **stage** (`str`): the stage of the experiment; one of:
105            - 'fit': training and validation dataset should be assigned to `self.dataset_train` and `self.dataset_val`.
106            - 'test': test dataset should be assigned to `self.dataset_test`.
107        """
108        if stage == "fit":
109            # these two stages must be done together because a sanity check for validation is conducted before training
110            pylogger.debug("Construct train and validation dataset ...")
111
112            self.dataset_train, self.dataset_val = self.train_and_val_dataset()
113
114            pylogger.info("Train and validation dataset are ready.")
115            pylogger.info(
116                "Train dataset size: %d",
117                len(self.dataset_train),
118            )
119            pylogger.info(
120                "Validation dataset size: %d",
121                len(self.dataset_val),
122            )
123
124        elif stage == "test":
125
126            pylogger.debug("Construct test dataset ...")
127
128            self.dataset_test = self.test_dataset()
129
130            pylogger.info("Test dataset are ready.")
131            pylogger.info(
132                "Test dataset for size: %d",
133                len(self.dataset_test),
134            )
135
136    def setup_task(self) -> None:
137        r"""Set up the task for the dataset."""
138        pass
139
140    def train_and_val_transforms(self) -> transforms.Compose:
141        r"""Transforms for training and validation dataset, incorporating the custom transforms with basic transforms like normalization and `ToTensor()`. It can be used in subclasses when constructing the dataset.
142
143        **Returns:**
144        - **train_and_val_transforms** (`transforms.Compose`): the composed train/val transforms.
145        """
146        repeat_channels_transform = (
147            transforms.Grayscale(num_output_channels=self.repeat_channels)
148            if self.repeat_channels is not None
149            else None
150        )
151        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
152        resize_transform = (
153            transforms.Resize(self.resize) if self.resize is not None else None
154        )
155        normalization_transform = transforms.Normalize(self.mean, self.std)
156
157        return transforms.Compose(
158            list(
159                filter(
160                    None,
161                    [
162                        repeat_channels_transform,
163                        to_tensor_transform,
164                        resize_transform,
165                        self.custom_transforms,
166                        normalization_transform,
167                    ],
168                )
169            )
170        )  # the order of transforms matters
171
172    def test_transforms(self) -> transforms.Compose:
173        r"""Transforms for test dataset. Only basic transforms like normalization and `ToTensor()` are included. It can be used in subclasses when constructing the dataset.
174
175        **Returns:**
176        - **test_transforms** (`transforms.Compose`): the composed test transforms.
177        """
178        repeat_channels_transform = (
179            transforms.Grayscale(num_output_channels=self.repeat_channels)
180            if self.repeat_channels is not None
181            else None
182        )
183        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
184        resize_transform = (
185            transforms.Resize(self.resize) if self.resize is not None else None
186        )
187        normalization_transform = transforms.Normalize(self.mean, self.std)
188
189        return transforms.Compose(
190            list(
191                filter(
192                    None,
193                    [
194                        repeat_channels_transform,
195                        to_tensor_transform,
196                        resize_transform,
197                        normalization_transform,
198                    ],
199                )
200            )
201        )  # the order of transforms matters. No custom transforms for test
202
203    def target_transform(self) -> Callable:
204        r"""Target transform to map the original class labels to CL class labels. It can be used in subclasses when constructing the dataset.
205
206        **Returns:**
207        - **target_transform** (`Callable`): the target transform.
208        """
209        class_map = self.get_class_map()
210
211        target_transform = ClassMapping(class_map=class_map)
212
213        return target_transform
214
215    @abstractmethod
216    def train_and_val_dataset(self) -> tuple[Any, Any]:
217        r"""Get the training and validation dataset. It must be implemented by subclasses.
218
219        **Returns:**
220        - **train_and_val_dataset** (`tuple[Any, Any]`): the train and validation dataset.
221        """
222
223    @abstractmethod
224    def test_dataset(self) -> Any:
225        """Get the test dataset. It must be implemented by subclasses.
226
227        **Returns:**
228        - **test_dataset** (`Any`): the test dataset.
229        """
230
231    def train_dataloader(self) -> DataLoader:
232        r"""DataLoader generator for the stage train. It is automatically called before training.
233
234        **Returns:**
235        - **train_dataloader** (`DataLoader`): the train DataLoader.
236        """
237
238        pylogger.debug("Construct train dataloader ...")
239
240        return DataLoader(
241            dataset=self.dataset_train,
242            batch_size=self.batch_size,
243            shuffle=True,  # shuffle train batch to prevent overfitting
244            num_workers=self.num_workers,
245            drop_last=True,  # to avoid batchnorm error (when batch_size is 1)
246        )
247
248    def val_dataloader(self) -> DataLoader:
249        r"""DataLoader generator for the validation stage. It is automatically called before validation.
250
251        **Returns:**
252        - **val_dataloader** (`DataLoader`): the validation DataLoader.
253        """
254
255        pylogger.debug("Construct validation dataloader...")
256
257        return DataLoader(
258            dataset=self.dataset_val,
259            batch_size=self.batch_size,
260            shuffle=False,  # don't have to shuffle val or test batch
261            num_workers=self.num_workers,
262        )
263
264    def test_dataloader(self) -> dict[int, DataLoader]:
265        r"""DataLoader generator for stage test. It is automatically called before testing.
266
267        **Returns:**
268        - **test_dataloader** (`DataLoader`): the test DataLoader.
269        """
270
271        pylogger.debug("Construct test dataloader...")
272
273        return DataLoader(
274            dataset=self.dataset_test,
275            batch_size=self.batch_size,
276            shuffle=False,  # don't have to shuffle val or test batch
277            num_workers=self.num_workers,
278        )

The base class of single-task learning datasets.

STLDataset( root: str, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
34    def __init__(
35        self,
36        root: str,
37        batch_size: int = 1,
38        num_workers: int = 0,
39        custom_transforms: Callable | transforms.Compose | None = None,
40        repeat_channels: int | None = None,
41        to_tensor: bool = True,
42        resize: tuple[int, int] | None = None,
43    ) -> None:
44        r"""
45        **Args:**
46        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
47        - **batch_size** (`int`): The batch size in train, val, test dataloader.
48        - **num_workers** (`int`): the number of workers for dataloaders.
49        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
50        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
51        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
52        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
53        """
54        super().__init__()
55
56        self.root: str = root
57        r"""The root directory of the original data files."""
58        self.batch_size: int = batch_size
59        r"""The batch size for dataloaders."""
60        self.num_workers: int = num_workers
61        r"""The number of workers for dataloaders."""
62
63        self.custom_transforms: Callable | transforms.Compose = custom_transforms
64        r"""The custom transforms."""
65        self.repeat_channels: int | None = repeat_channels
66        r"""The number of channels to repeat."""
67        self.to_tensor: bool = to_tensor
68        r"""The to_tensor flag."""
69        self.resize: tuple[int, int] | None = resize
70        r"""The size to resize."""
71
72        self.dataset_train: Any
73        r"""Training dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
74        self.dataset_val: Any
75        r"""Validation dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
76        self.dataset_test: Any
77        r"""Test dataset object. Can be PyTorch Dataset objects or any other dataset objects."""
78        self.mean: float
79        r"""Mean value for normalization."""
80        self.std: float
81        r"""Standard deviation value for normalization."""
82
83        STLDataset.sanity_check(self)

Args:

  • root (str): the root directory where the original data files for constructing the STL dataset physically live.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
root: str

The root directory of the original data files.

batch_size: int

The batch size for dataloaders.

num_workers: int

The number of workers for dataloaders.

custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose]

The custom transforms.

repeat_channels: int | None

The number of channels to repeat.

to_tensor: bool

The to_tensor flag.

resize: tuple[int, int] | None

The size to resize.

dataset_train: Any

Training dataset object. Can be PyTorch Dataset objects or any other dataset objects.

dataset_val: Any

Validation dataset object. Can be PyTorch Dataset objects or any other dataset objects.

dataset_test: Any

Test dataset object. Can be PyTorch Dataset objects or any other dataset objects.

mean: float

Mean value for normalization.

std: float

Standard deviation value for normalization.

def sanity_check(self) -> None:
85    def sanity_check(self) -> None:
86        r"""Sanity check."""

Sanity check.

@abstractmethod
def get_class_map(self) -> dict[str | int, int]:
88    @abstractmethod
89    def get_class_map(self) -> dict[str | int, int]:
90        r"""Get the mapping of classes. It must be implemented by subclasses.
91
92        **Returns:**
93        - **class_map**(`dict[str | int, int]`): the class map. Keys are original class labels and values are integer class labels for single-task learning.
94        """

Get the mapping of classes. It must be implemented by subclasses.

Returns:

  • class_map(dict[str | int, int]): the class map. Keys are original class labels and values are integer class labels for single-task learning.
@abstractmethod
def prepare_data(self) -> None:
96    @abstractmethod
97    def prepare_data(self) -> None:
98        r"""Use this to download and prepare data. It must be implemented by subclasses, as required by `LightningDatamodule`."""

Use this to download and prepare data. It must be implemented by subclasses, as required by LightningDataModule.

def setup(self, stage: str) -> None:
100    def setup(self, stage: str) -> None:
101        r"""Set up the dataset for different stages.
102
103        **Args:**
104        - **stage** (`str`): the stage of the experiment; one of:
105            - 'fit': training and validation dataset should be assigned to `self.dataset_train` and `self.dataset_val`.
106            - 'test': test dataset should be assigned to `self.dataset_test`.
107        """
108        if stage == "fit":
109            # these two stages must be done together because a sanity check for validation is conducted before training
110            pylogger.debug("Construct train and validation dataset ...")
111
112            self.dataset_train, self.dataset_val = self.train_and_val_dataset()
113
114            pylogger.info("Train and validation dataset are ready.")
115            pylogger.info(
116                "Train dataset size: %d",
117                len(self.dataset_train),
118            )
119            pylogger.info(
120                "Validation dataset size: %d",
121                len(self.dataset_val),
122            )
123
124        elif stage == "test":
125
126            pylogger.debug("Construct test dataset ...")
127
128            self.dataset_test = self.test_dataset()
129
130            pylogger.info("Test dataset are ready.")
131            pylogger.info(
132                "Test dataset for size: %d",
133                len(self.dataset_test),
134            )

Set up the dataset for different stages.

Args:

  • stage (str): the stage of the experiment; one of:
    • 'fit': training and validation dataset should be assigned to self.dataset_train and self.dataset_val.
    • 'test': test dataset should be assigned to self.dataset_test.
def setup_task(self) -> None:
136    def setup_task(self) -> None:
137        r"""Set up the task for the dataset."""
138        pass

Set up the task for the dataset.

def train_and_val_transforms(self) -> torchvision.transforms.transforms.Compose:
140    def train_and_val_transforms(self) -> transforms.Compose:
141        r"""Transforms for training and validation dataset, incorporating the custom transforms with basic transforms like normalization and `ToTensor()`. It can be used in subclasses when constructing the dataset.
142
143        **Returns:**
144        - **train_and_val_transforms** (`transforms.Compose`): the composed train/val transforms.
145        """
146        repeat_channels_transform = (
147            transforms.Grayscale(num_output_channels=self.repeat_channels)
148            if self.repeat_channels is not None
149            else None
150        )
151        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
152        resize_transform = (
153            transforms.Resize(self.resize) if self.resize is not None else None
154        )
155        normalization_transform = transforms.Normalize(self.mean, self.std)
156
157        return transforms.Compose(
158            list(
159                filter(
160                    None,
161                    [
162                        repeat_channels_transform,
163                        to_tensor_transform,
164                        resize_transform,
165                        self.custom_transforms,
166                        normalization_transform,
167                    ],
168                )
169            )
170        )  # the order of transforms matters

Transforms for training and validation dataset, incorporating the custom transforms with basic transforms like normalization and ToTensor(). It can be used in subclasses when constructing the dataset.

Returns:

  • train_and_val_transforms (transforms.Compose): the composed train/val transforms.
def test_transforms(self) -> torchvision.transforms.transforms.Compose:
172    def test_transforms(self) -> transforms.Compose:
173        r"""Transforms for test dataset. Only basic transforms like normalization and `ToTensor()` are included. It can be used in subclasses when constructing the dataset.
174
175        **Returns:**
176        - **test_transforms** (`transforms.Compose`): the composed test transforms.
177        """
178        repeat_channels_transform = (
179            transforms.Grayscale(num_output_channels=self.repeat_channels)
180            if self.repeat_channels is not None
181            else None
182        )
183        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
184        resize_transform = (
185            transforms.Resize(self.resize) if self.resize is not None else None
186        )
187        normalization_transform = transforms.Normalize(self.mean, self.std)
188
189        return transforms.Compose(
190            list(
191                filter(
192                    None,
193                    [
194                        repeat_channels_transform,
195                        to_tensor_transform,
196                        resize_transform,
197                        normalization_transform,
198                    ],
199                )
200            )
201        )  # the order of transforms matters. No custom transforms for test

Transforms for test dataset. Only basic transforms like normalization and ToTensor() are included. It can be used in subclasses when constructing the dataset.

Returns:

  • test_transforms (transforms.Compose): the composed test transforms.
def target_transform(self) -> Callable:
203    def target_transform(self) -> Callable:
204        r"""Target transform to map the original class labels to CL class labels. It can be used in subclasses when constructing the dataset.
205
206        **Returns:**
207        - **target_transform** (`Callable`): the target transform.
208        """
209        class_map = self.get_class_map()
210
211        target_transform = ClassMapping(class_map=class_map)
212
213        return target_transform

Target transform to map the original class labels to CL class labels. It can be used in subclasses when constructing the dataset.

Returns:

  • target_transform (Callable): the target transform.
@abstractmethod
def train_and_val_dataset(self) -> tuple[typing.Any, typing.Any]:
215    @abstractmethod
216    def train_and_val_dataset(self) -> tuple[Any, Any]:
217        r"""Get the training and validation dataset. It must be implemented by subclasses.
218
219        **Returns:**
220        - **train_and_val_dataset** (`tuple[Any, Any]`): the train and validation dataset.
221        """

Get the training and validation dataset. It must be implemented by subclasses.

Returns:

  • train_and_val_dataset (tuple[Any, Any]): the train and validation dataset.
@abstractmethod
def test_dataset(self) -> Any:
223    @abstractmethod
224    def test_dataset(self) -> Any:
225        """Get the test dataset. It must be implemented by subclasses.
226
227        **Returns:**
228        - **test_dataset** (`Any`): the test dataset.
229        """

Get the test dataset. It must be implemented by subclasses.

Returns:

  • test_dataset (Any): the test dataset.
def train_dataloader(self) -> torch.utils.data.dataloader.DataLoader:
231    def train_dataloader(self) -> DataLoader:
232        r"""DataLoader generator for the stage train. It is automatically called before training.
233
234        **Returns:**
235        - **train_dataloader** (`DataLoader`): the train DataLoader.
236        """
237
238        pylogger.debug("Construct train dataloader ...")
239
240        return DataLoader(
241            dataset=self.dataset_train,
242            batch_size=self.batch_size,
243            shuffle=True,  # shuffle train batch to prevent overfitting
244            num_workers=self.num_workers,
245            drop_last=True,  # to avoid batchnorm error (when batch_size is 1)
246        )

DataLoader generator for the stage train. It is automatically called before training.

Returns:

  • train_dataloader (DataLoader): the train DataLoader.
def val_dataloader(self) -> torch.utils.data.dataloader.DataLoader:
248    def val_dataloader(self) -> DataLoader:
249        r"""DataLoader generator for the validation stage. It is automatically called before validation.
250
251        **Returns:**
252        - **val_dataloader** (`DataLoader`): the validation DataLoader.
253        """
254
255        pylogger.debug("Construct validation dataloader...")
256
257        return DataLoader(
258            dataset=self.dataset_val,
259            batch_size=self.batch_size,
260            shuffle=False,  # don't have to shuffle val or test batch
261            num_workers=self.num_workers,
262        )

DataLoader generator for the validation stage. It is automatically called before validation.

Returns:

  • val_dataloader (DataLoader): the validation DataLoader.
def test_dataloader(self) -> dict[int, torch.utils.data.dataloader.DataLoader]:
264    def test_dataloader(self) -> dict[int, DataLoader]:
265        r"""DataLoader generator for stage test. It is automatically called before testing.
266
267        **Returns:**
268        - **test_dataloader** (`DataLoader`): the test DataLoader.
269        """
270
271        pylogger.debug("Construct test dataloader...")
272
273        return DataLoader(
274            dataset=self.dataset_test,
275            batch_size=self.batch_size,
276            shuffle=False,  # don't have to shuffle val or test batch
277            num_workers=self.num_workers,
278        )

DataLoader generator for stage test. It is automatically called before testing.

Returns:

  • test_dataloader (DataLoader): the test DataLoader.
class STLDatasetFromRaw(clarena.stl_datasets.STLDataset):
281class STLDatasetFromRaw(STLDataset):
282    r"""The base class of single-task learning datasets from raw PyTorch Dataset."""
283
284    original_dataset_python_class: type[Dataset]
285    r"""The original dataset class. **It must be provided in subclasses.**"""
286
287    def __init__(
288        self,
289        root: str,
290        batch_size: int = 1,
291        num_workers: int = 0,
292        custom_transforms: Callable | transforms.Compose | None = None,
293        repeat_channels: int | None = None,
294        to_tensor: bool = True,
295        resize: tuple[int, int] | None = None,
296    ) -> None:
297        r"""
298        **Args:**
299        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
300        - **batch_size** (`int`): The batch size in train, val, test dataloader.
301        - **num_workers** (`int`): the number of workers for dataloaders.
302        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
303        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
304        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
305        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
306        """
307        super().__init__(
308            root=root,
309            batch_size=batch_size,
310            num_workers=num_workers,
311            custom_transforms=custom_transforms,
312            repeat_channels=repeat_channels,
313            to_tensor=to_tensor,
314            resize=resize,
315        )
316
317        self.original_dataset_constants: type[DatasetConstants] = (
318            DATASET_CONSTANTS_MAPPING[self.original_dataset_python_class]
319        )
320        r"""The original dataset constants class."""
321
322    def get_class_map(self) -> dict[str | int, int]:
323        r"""Get the mapping of classes.
324
325        **Returns:**
326        - **class_map**(`dict[str | int, int]`): the class map. Key is original class label, value is integer class label for single-task learning.
327        """
328        return self.original_dataset_constants.CLASS_MAP
329
330    def setup_task(self) -> None:
331        r"""Set up the task for the dataset."""
332        super().setup_task()
333
334        self.mean = (
335            self.original_dataset_constants.MEAN
336        )  # the same with the original dataset
337        self.std = (
338            self.original_dataset_constants.STD
339        )  # the same with the original dataset

The base class of single-task learning datasets from raw PyTorch Dataset.

STLDatasetFromRaw( root: str, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
287    def __init__(
288        self,
289        root: str,
290        batch_size: int = 1,
291        num_workers: int = 0,
292        custom_transforms: Callable | transforms.Compose | None = None,
293        repeat_channels: int | None = None,
294        to_tensor: bool = True,
295        resize: tuple[int, int] | None = None,
296    ) -> None:
297        r"""
298        **Args:**
299        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
300        - **batch_size** (`int`): The batch size in train, val, test dataloader.
301        - **num_workers** (`int`): the number of workers for dataloaders.
302        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
303        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
304        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
305        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
306        """
307        super().__init__(
308            root=root,
309            batch_size=batch_size,
310            num_workers=num_workers,
311            custom_transforms=custom_transforms,
312            repeat_channels=repeat_channels,
313            to_tensor=to_tensor,
314            resize=resize,
315        )
316
317        self.original_dataset_constants: type[DatasetConstants] = (
318            DATASET_CONSTANTS_MAPPING[self.original_dataset_python_class]
319        )
320        r"""The original dataset constants class."""

Args:

  • root (str): the root directory where the original data files for constructing the STL dataset physically live.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset]

The original dataset class. It must be provided in subclasses.

original_dataset_constants: type[clarena.stl_datasets.raw.constants.DatasetConstants]

The original dataset constants class.

def get_class_map(self) -> dict[str | int, int]:
322    def get_class_map(self) -> dict[str | int, int]:
323        r"""Get the mapping of classes.
324
325        **Returns:**
326        - **class_map**(`dict[str | int, int]`): the class map. Key is original class label, value is integer class label for single-task learning.
327        """
328        return self.original_dataset_constants.CLASS_MAP

Get the mapping of classes.

Returns:

  • class_map(dict[str | int, int]): the class map. Key is original class label, value is integer class label for single-task learning.
def setup_task(self) -> None:
330    def setup_task(self) -> None:
331        r"""Set up the task for the dataset."""
332        super().setup_task()
333
334        self.mean = (
335            self.original_dataset_constants.MEAN
336        )  # the same with the original dataset
337        self.std = (
338            self.original_dataset_constants.STD
339        )  # the same with the original dataset

Set up the task for the dataset.

class TaskLabelledDataset(typing.Generic[+_T_co]):
342class TaskLabelledDataset(Dataset):
343    r"""The dataset class that labels the a task's dataset with the given task ID. It is used to label the dataset with the task ID for MTL experiment."""
344
345    def __init__(self, dataset: Dataset, task_id: int) -> None:
346        r"""
347        **Args:**
348        - **dataset** (`Dataset`): the dataset to be labelled.
349        - **task_id** (`int`): the task ID to be labelled.
350        """
351        super().__init__()
352
353        self.dataset: Dataset = dataset
354        r"""The original dataset object."""
355        self.task_id: int = task_id
356        r"""The task ID."""
357
358    def __len__(self) -> int:
359        r"""The length of the dataset.
360
361        **Returns:**
362        - **length** (`int`): the length of the dataset.
363        """
364
365        return len(self.dataset)  # the same as the length of the original dataset.
366
367    def __getitem__(self, idx) -> tuple[Any, Any, int]:
368        r"""Get the item from the dataset. Labelled with the task ID.
369
370        **Args:**
371        - **idx** (`int`): the index of the item to be retrieved.
372
373        **Returns:**
374        - **x** (`Any`): the input data.
375        - **y** (`Any`): the target data.
376        - **task_id** (`int`): the task ID.
377        """
378        x, y = self.dataset[idx]
379        return x, y, self.task_id

The dataset class that labels a task's dataset with the given task ID. It is used to label the dataset with the task ID for MTL experiment.

TaskLabelledDataset(dataset: torch.utils.data.dataset.Dataset, task_id: int)
345    def __init__(self, dataset: Dataset, task_id: int) -> None:
346        r"""
347        **Args:**
348        - **dataset** (`Dataset`): the dataset to be labelled.
349        - **task_id** (`int`): the task ID to be labelled.
350        """
351        super().__init__()
352
353        self.dataset: Dataset = dataset
354        r"""The original dataset object."""
355        self.task_id: int = task_id
356        r"""The task ID."""

Args:

  • dataset (Dataset): the dataset to be labelled.
  • task_id (int): the task ID to be labelled.
dataset: torch.utils.data.dataset.Dataset

The original dataset object.

task_id: int

The task ID.

class SEMEION(clarena.stl_datasets.STLDatasetFromRaw):
 22class SEMEION(STLDatasetFromRaw):
 23    r"""SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image."""
 24
 25    original_dataset_python_class: type[Dataset] = SEMEIONRaw
 26    r"""The original dataset class."""
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        test_percentage: float,
 32        validation_percentage: float,
 33        batch_size: int = 1,
 34        num_workers: int = 0,
 35        custom_transforms: Callable | transforms.Compose | None = None,
 36        repeat_channels: int | None = None,
 37        to_tensor: bool = True,
 38        resize: tuple[int, int] | None = None,
 39    ) -> None:
 40        r"""
 41        **Args:**
 42        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
 43        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
 44        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 45        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 46        - **num_workers** (`int`): the number of workers for dataloaders.
 47        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 48        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 49        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 50        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 51        """
 52        super().__init__(
 53            root=root,
 54            batch_size=batch_size,
 55            num_workers=num_workers,
 56            custom_transforms=custom_transforms,
 57            repeat_channels=repeat_channels,
 58            to_tensor=to_tensor,
 59            resize=resize,
 60        )
 61
 62        self.test_percentage: float = test_percentage
 63        r"""The percentage to randomly split some data into test data."""
 64        self.validation_percentage: float = validation_percentage
 65        r"""The percentage to randomly split some training data into validation data."""
 66
 67    def prepare_data(self) -> None:
 68        r"""Download the original SEMEION dataset if haven't."""
 69
 70        SEMEIONRaw(root=self.root, download=True)
 71
 72        pylogger.debug(
 73            "The original SEMEION dataset has been downloaded to %s.", self.root
 74        )
 75
 76    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 77        """Get the training and validation dataset.
 78
 79        **Returns:**
 80        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 81        """
 82        dataset_all = SEMEIONRaw(
 83            root=self.root,
 84            transform=self.train_and_val_transforms(),
 85            target_transform=self.target_transform(),
 86            download=False,
 87        )
 88
 89        dataset_train_and_val, _ = random_split(
 90            dataset_all,
 91            lengths=[
 92                1 - self.test_percentage,
 93                self.test_percentage,
 94            ],
 95            generator=torch.Generator().manual_seed(
 96                42
 97            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
 98        )
 99
100        return random_split(
101            dataset_train_and_val,
102            lengths=[1 - self.validation_percentage, self.validation_percentage],
103            generator=torch.Generator().manual_seed(
104                42
105            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
106        )
107
108    def test_dataset(self) -> Dataset:
109        r"""Get the test dataset.
110
111        **Returns:**
112        - **test_dataset** (`Dataset`): the test dataset.
113        """
114        dataset_all = SEMEIONRaw(
115            root=self.root,
116            transform=self.train_and_val_transforms(),
117            target_transform=self.target_transform(),
118            download=False,
119        )
120
121        _, dataset_test = random_split(
122            dataset_all,
123            lengths=[1 - self.test_percentage, self.test_percentage],
124            generator=torch.Generator().manual_seed(42),
125        )
126
127        return dataset_test

SEMEION dataset. The SEMEION dataset is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each 16x16 grayscale image.

SEMEION( root: str, test_percentage: float, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
28    def __init__(
29        self,
30        root: str,
31        test_percentage: float,
32        validation_percentage: float,
33        batch_size: int = 1,
34        num_workers: int = 0,
35        custom_transforms: Callable | transforms.Compose | None = None,
36        repeat_channels: int | None = None,
37        to_tensor: bool = True,
38        resize: tuple[int, int] | None = None,
39    ) -> None:
40        r"""
41        **Args:**
42        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' live.
43        - **test_percentage** (`float`): the percentage to randomly split some data into test data.
44        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
45        - **batch_size** (`int`): The batch size in train, val, test dataloader.
46        - **num_workers** (`int`): the number of workers for dataloaders.
47        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
48        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
49        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
50        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
51        """
52        super().__init__(
53            root=root,
54            batch_size=batch_size,
55            num_workers=num_workers,
56            custom_transforms=custom_transforms,
57            repeat_channels=repeat_channels,
58            to_tensor=to_tensor,
59            resize=resize,
60        )
61
62        self.test_percentage: float = test_percentage
63        r"""The percentage to randomly split some data into test data."""
64        self.validation_percentage: float = validation_percentage
65        r"""The percentage to randomly split some training data into validation data."""

Args:

  • root (str): the root directory where the original SEMEION data 'SEMEION/' live.
  • test_percentage (float): the percentage to randomly split some data into test data.
  • validation_percentage (float): the percentage to randomly split some training data into validation data.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.semeion.SEMEION'>

The original dataset class.

test_percentage: float

The percentage to randomly split some data into test data.

validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
67    def prepare_data(self) -> None:
68        r"""Download the original SEMEION dataset if haven't."""
69
70        SEMEIONRaw(root=self.root, download=True)
71
72        pylogger.debug(
73            "The original SEMEION dataset has been downloaded to %s.", self.root
74        )

Download the original SEMEION dataset if it hasn't been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
 76    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 77        """Get the training and validation dataset.
 78
 79        **Returns:**
 80        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 81        """
 82        dataset_all = SEMEIONRaw(
 83            root=self.root,
 84            transform=self.train_and_val_transforms(),
 85            target_transform=self.target_transform(),
 86            download=False,
 87        )
 88
 89        dataset_train_and_val, _ = random_split(
 90            dataset_all,
 91            lengths=[
 92                1 - self.test_percentage,
 93                self.test_percentage,
 94            ],
 95            generator=torch.Generator().manual_seed(
 96                42
 97            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
 98        )
 99
100        return random_split(
101            dataset_train_and_val,
102            lengths=[1 - self.validation_percentage, self.validation_percentage],
103            generator=torch.Generator().manual_seed(
104                42
105            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
106        )

Get the training and validation dataset.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
108    def test_dataset(self) -> Dataset:
109        r"""Get the test dataset.
110
111        **Returns:**
112        - **test_dataset** (`Dataset`): the test dataset.
113        """
114        dataset_all = SEMEIONRaw(
115            root=self.root,
116            transform=self.train_and_val_transforms(),
117            target_transform=self.target_transform(),
118            download=False,
119        )
120
121        _, dataset_test = random_split(
122            dataset_all,
123            lengths=[1 - self.test_percentage, self.test_percentage],
124            generator=torch.Generator().manual_seed(42),
125        )
126
127        return dataset_test

Get the test dataset.

Returns:

  • test_dataset (Dataset): the test dataset.