clarena.stl_datasets
Single-Task Learning Datasets
This submodule provides the single-task learning datasets that can be used in CLArena.
Here are the base classes for single-task learning datasets, which inherit from Lightning LightningDataModule:
- `STLDataset`: The base class for all single-task learning datasets.
    - `STLDatasetFromRaw`: The base class for constructing single-task learning datasets from raw datasets. A child class of `STLDataset`.
Please note that this is API documentation. Please refer to the main documentation pages for more information about how to configure and implement single-task learning datasets:

- [**Configure STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/components/STL-dataset)
- [**Implement Custom STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/custom-implementation/STL-dataset)
```python
r"""

# Single-Task Learning Datasets

This submodule provides the **single-task learning datasets** that can be used in CLArena.

Here are the base classes for single-task learning datasets, which inherit from Lightning `LightningDataModule`:

- `STLDataset`: The base class for all single-task learning datasets.
    - `STLDatasetFromRaw`: The base class for constructing single-task learning datasets from raw datasets. A child class of `STLDataset`.

Please note that this is API documentation. Please refer to the main documentation pages for more information about how to configure and implement single-task learning datasets:

- [**Configure STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/components/STL-dataset)
- [**Implement Custom STL Dataset**](https://pengxiang-wang.com/projects/continual-learning-arena/docs/custom-implementation/STL-dataset)

"""

from .base import STLDataset, STLDatasetFromRaw, TaskLabelledDataset

from .ahdd import ArabicHandwrittenDigits
from .caltech101 import Caltech101
from .caltech256 import Caltech256
from .celeba import CelebA
from .cifar10 import CIFAR10
from .cifar100 import CIFAR100
from .country211 import Country211
from .cub2002011 import CUB2002011
from .dtd import DTD
from .facescrub import FaceScrub
from .fashionmnist import FashionMNIST
from .fer2013 import FER2013
from .fgvc_aircraft import FGVCAircraft
from .flowers102 import Flowers102
from .food101 import Food101
from .emnist import EMNIST
from .eurosat import EuroSAT
from .gtsrb import GTSRB
from .imagenette import Imagenette
from .kannadamnist import KannadaMNIST
from .kmnist import KMNIST
from .linnaeus5 import Linnaeus5
from .mnist import MNIST
from .notmnist import NotMNIST
from .oxford_iiit_pet import OxfordIIITPet
from .pcam import PCAM
from .renderedsst2 import RenderedSST2
from .SEMEION import SEMEION
from .sign_language_mnist import SignLanguageMNIST
from .stanfordcars import StanfordCars
from .sun397 import SUN397
from .svhn import SVHN
from .tinyimagenet import TinyImageNet
from .usps import USPS


__all__ = [
    "STLDataset",
    "STLDatasetFromRaw",
    "TaskLabelledDataset",
    "ahdd",
    "caltech101",
    "caltech256",
    "celeba",
    "cifar10",
    "cifar100",
    "country211",
    "cub2002011",
    "dtd",
    "facescrub",
    "fashionmnist",
    "fer2013",
    "fgvc_aircraft",
    "flowers102",
    "food101",
    "emnist",
    "eurosat",
    "gtsrb",
    "imagenette",
    "kannadamnist",
    "kmnist",
    "linnaeus5",
    "mnist",
    "notmnist",
    "oxford_iiit_pet",
    "pcam",
    "renderedsst2",
    "SEMEION",
    "sign_language_mnist",
    "stanfordcars",
    "sun397",
    "svhn",
    "tinyimagenet",
    "usps",
]
```
```python
class STLDataset(LightningDataModule):
    r"""The base class of single-task learning datasets."""

    def __init__(
        self,
        root: str,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
        - **batch_size** (`int`): the batch size for the train, val and test dataloaders.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to the TRAIN dataset ONLY. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalization and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is `None`, which means no repeat. If not `None`, it should be an integer.
        - **to_tensor** (`bool`): whether to include the `ToTensor()` transform. Default is `True`.
        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is `None`, which means no resize. If not `None`, it should be a tuple of two integers.
        """
        super().__init__()

        self.root: str = root
        r"""The root directory of the original data files."""
        self.batch_size: int = batch_size
        r"""The batch size for dataloaders."""
        self.num_workers: int = num_workers
        r"""The number of workers for dataloaders."""

        self.custom_transforms: Callable | transforms.Compose = custom_transforms
        r"""The custom transforms."""
        self.repeat_channels: int | None = repeat_channels
        r"""The number of channels to repeat."""
        self.to_tensor: bool = to_tensor
        r"""The to_tensor flag."""
        self.resize: tuple[int, int] | None = resize
        r"""The size to resize to."""

        self.dataset_train: Any
        r"""Training dataset object. Can be a PyTorch Dataset object or any other dataset object."""
        self.dataset_val: Any
        r"""Validation dataset object. Can be a PyTorch Dataset object or any other dataset object."""
        self.dataset_test: Any
        r"""Test dataset object. Can be a PyTorch Dataset object or any other dataset object."""
        self.mean: float
        r"""Mean value for normalization."""
        self.std: float
        r"""Standard deviation value for normalization."""

        STLDataset.sanity_check(self)

    def sanity_check(self) -> None:
        r"""Sanity check."""

    @abstractmethod
    def get_class_map(self) -> dict[str | int, int]:
        r"""Get the mapping of classes. It must be implemented by subclasses.

        **Returns:**
        - **class_map** (`dict[str | int, int]`): the class map. Keys are original class labels and values are integer class labels for single-task learning.
        """

    @abstractmethod
    def prepare_data(self) -> None:
        r"""Use this to download and prepare data. It must be implemented by subclasses, as required by `LightningDataModule`."""

    def setup(self, stage: str) -> None:
        r"""Set up the dataset for different stages.

        **Args:**
        - **stage** (`str`): the stage of the experiment; one of:
            - 'fit': training and validation datasets should be assigned to `self.dataset_train` and `self.dataset_val`.
            - 'test': the test dataset should be assigned to `self.dataset_test`.
        """
        if stage == "fit":
            # these two must be constructed together because a sanity check for validation is conducted before training
            pylogger.debug("Construct train and validation dataset ...")

            self.dataset_train, self.dataset_val = self.train_and_val_dataset()

            pylogger.info("Train and validation datasets are ready.")
            pylogger.info(
                "Train dataset size: %d",
                len(self.dataset_train),
            )
            pylogger.info(
                "Validation dataset size: %d",
                len(self.dataset_val),
            )

        elif stage == "test":

            pylogger.debug("Construct test dataset ...")

            self.dataset_test = self.test_dataset()

            pylogger.info("Test dataset is ready.")
            pylogger.info(
                "Test dataset size: %d",
                len(self.dataset_test),
            )

    def setup_task(self) -> None:
        r"""Set up the task for the dataset."""

    def train_and_val_transforms(self) -> transforms.Compose:
        r"""Transforms for the training and validation datasets, incorporating the custom transforms with basic transforms like normalization and `ToTensor()`. It can be used in subclasses when constructing the dataset.

        **Returns:**
        - **train_and_val_transforms** (`transforms.Compose`): the composed train/val transforms.
        """
        repeat_channels_transform = (
            transforms.Grayscale(num_output_channels=self.repeat_channels)
            if self.repeat_channels is not None
            else None
        )
        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
        resize_transform = (
            transforms.Resize(self.resize) if self.resize is not None else None
        )
        normalization_transform = transforms.Normalize(self.mean, self.std)

        return transforms.Compose(
            list(
                filter(
                    None,
                    [
                        repeat_channels_transform,
                        to_tensor_transform,
                        resize_transform,
                        self.custom_transforms,
                        normalization_transform,
                    ],
                )
            )
        )  # the order of transforms matters

    def test_transforms(self) -> transforms.Compose:
        r"""Transforms for the test dataset. Only basic transforms like normalization and `ToTensor()` are included. It can be used in subclasses when constructing the dataset.

        **Returns:**
        - **test_transforms** (`transforms.Compose`): the composed test transforms.
        """
        repeat_channels_transform = (
            transforms.Grayscale(num_output_channels=self.repeat_channels)
            if self.repeat_channels is not None
            else None
        )
        to_tensor_transform = transforms.ToTensor() if self.to_tensor else None
        resize_transform = (
            transforms.Resize(self.resize) if self.resize is not None else None
        )
        normalization_transform = transforms.Normalize(self.mean, self.std)

        return transforms.Compose(
            list(
                filter(
                    None,
                    [
                        repeat_channels_transform,
                        to_tensor_transform,
                        resize_transform,
                        normalization_transform,
                    ],
                )
            )
        )  # the order of transforms matters. No custom transforms for test

    def target_transform(self) -> Callable:
        r"""Target transform to map the original class labels to CL class labels. It can be used in subclasses when constructing the dataset.

        **Returns:**
        - **target_transform** (`Callable`): the target transform.
        """
        class_map = self.get_class_map()

        target_transform = ClassMapping(class_map=class_map)

        return target_transform

    @abstractmethod
    def train_and_val_dataset(self) -> tuple[Any, Any]:
        r"""Get the training and validation datasets. It must be implemented by subclasses.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Any, Any]`): the train and validation datasets.
        """

    @abstractmethod
    def test_dataset(self) -> Any:
        r"""Get the test dataset. It must be implemented by subclasses.

        **Returns:**
        - **test_dataset** (`Any`): the test dataset.
        """

    def train_dataloader(self) -> DataLoader:
        r"""DataLoader generator for the train stage. It is automatically called before training.

        **Returns:**
        - **train_dataloader** (`DataLoader`): the train DataLoader.
        """

        pylogger.debug("Construct train dataloader ...")

        return DataLoader(
            dataset=self.dataset_train,
            batch_size=self.batch_size,
            shuffle=True,  # shuffle train batches to prevent overfitting
            num_workers=self.num_workers,
            drop_last=True,  # to avoid a batchnorm error when the last batch has size 1
        )

    def val_dataloader(self) -> DataLoader:
        r"""DataLoader generator for the validation stage. It is automatically called before validation.

        **Returns:**
        - **val_dataloader** (`DataLoader`): the validation DataLoader.
        """

        pylogger.debug("Construct validation dataloader ...")

        return DataLoader(
            dataset=self.dataset_val,
            batch_size=self.batch_size,
            shuffle=False,  # no need to shuffle val or test batches
            num_workers=self.num_workers,
        )

    def test_dataloader(self) -> DataLoader:
        r"""DataLoader generator for the test stage. It is automatically called before testing.

        **Returns:**
        - **test_dataloader** (`DataLoader`): the test DataLoader.
        """

        pylogger.debug("Construct test dataloader ...")

        return DataLoader(
            dataset=self.dataset_test,
            batch_size=self.batch_size,
            shuffle=False,  # no need to shuffle val or test batches
            num_workers=self.num_workers,
        )
```
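The `train_and_val_transforms()` and `test_transforms()` methods share a pattern worth noting: each optional step is built as either a callable or `None`, and `filter(None, ...)` drops the disabled steps while preserving order before composition. A dependency-free sketch of that pattern (the string-wrapping lambdas and `compose` are illustrative stand-ins, not torchvision or CLArena code):

```python
def compose(steps):
    """Apply callables left to right, like torchvision's transforms.Compose."""
    def pipeline(x):
        for step in steps:
            x = step(x)
        return x
    return pipeline

def build_train_pipeline(repeat_channels=None, to_tensor=True, resize=None, custom=None):
    # Each optional step is either a callable or None, mirroring the method above.
    repeat_step = (lambda x: f"repeat({x})") if repeat_channels is not None else None
    to_tensor_step = (lambda x: f"tensor({x})") if to_tensor else None
    resize_step = (lambda x: f"resize({x})") if resize is not None else None
    normalize_step = lambda x: f"norm({x})"
    # filter(None, ...) drops disabled steps while preserving the fixed order
    steps = list(filter(None, [repeat_step, to_tensor_step, resize_step, custom, normalize_step]))
    return compose(steps)

pipeline = build_train_pipeline(resize=(32, 32))
print(pipeline("img"))  # norm(resize(tensor(img)))
```

As in the real methods, normalization always runs last, and the test pipeline simply omits the `custom` slot.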
```python
class STLDatasetFromRaw(STLDataset):
    r"""The base class of single-task learning datasets constructed from a raw PyTorch Dataset."""

    original_dataset_python_class: type[Dataset]
    r"""The original dataset class. **It must be provided in subclasses.**"""

    def __init__(
        self,
        root: str,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original data files for constructing the STL dataset physically live.
        - **batch_size** (`int`): the batch size for the train, val and test dataloaders.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to the TRAIN dataset ONLY. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalization and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is `None`, which means no repeat. If not `None`, it should be an integer.
        - **to_tensor** (`bool`): whether to include the `ToTensor()` transform. Default is `True`.
        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is `None`, which means no resize. If not `None`, it should be a tuple of two integers.
        """
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.original_dataset_constants: type[DatasetConstants] = (
            DATASET_CONSTANTS_MAPPING[self.original_dataset_python_class]
        )
        r"""The original dataset constants class."""

    def get_class_map(self) -> dict[str | int, int]:
        r"""Get the mapping of classes.

        **Returns:**
        - **class_map** (`dict[str | int, int]`): the class map. Keys are original class labels and values are integer class labels for single-task learning.
        """
        return self.original_dataset_constants.CLASS_MAP

    def setup_task(self) -> None:
        r"""Set up the task for the dataset."""
        super().setup_task()

        self.mean = (
            self.original_dataset_constants.MEAN
        )  # the same as the original dataset
        self.std = (
            self.original_dataset_constants.STD
        )  # the same as the original dataset
```
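The contract behind `get_class_map()` is simple: it returns a dict from original labels (strings or ints) to contiguous integer class labels, which the target transform then applies per sample. A dependency-free sketch of that idea; `ClassMappingSketch` is a hypothetical stand-in for CLArena's `ClassMapping`, and the labels below are made up:

```python
class ClassMappingSketch:
    """Map an original class label to its integer class label via a lookup."""

    def __init__(self, class_map: dict) -> None:
        self.class_map = class_map

    def __call__(self, original_label):
        return self.class_map[original_label]


# e.g. a raw dataset labelled by species name (hypothetical labels)
class_map = {"cat": 0, "dog": 1, "frog": 2}
target_transform = ClassMappingSketch(class_map)
print(target_transform("dog"))  # 1
```

In `STLDatasetFromRaw`, this map comes pre-built from the original dataset's constants class (`CLASS_MAP`), so subclasses rarely need to write one by hand.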
```python
class TaskLabelledDataset(Dataset):
    r"""The dataset class that labels a task's dataset with the given task ID. It is used to label the dataset with the task ID for MTL experiments."""

    def __init__(self, dataset: Dataset, task_id: int) -> None:
        r"""
        **Args:**
        - **dataset** (`Dataset`): the dataset to be labelled.
        - **task_id** (`int`): the task ID to label it with.
        """
        super().__init__()

        self.dataset: Dataset = dataset
        r"""The original dataset object."""
        self.task_id: int = task_id
        r"""The task ID."""

    def __len__(self) -> int:
        r"""The length of the dataset.

        **Returns:**
        - **length** (`int`): the length of the dataset.
        """

        return len(self.dataset)  # the same as the length of the original dataset

    def __getitem__(self, idx: int) -> tuple[Any, Any, int]:
        r"""Get an item from the dataset, labelled with the task ID.

        **Args:**
        - **idx** (`int`): the index of the item to retrieve.

        **Returns:**
        - **x** (`Any`): the input data.
        - **y** (`Any`): the target data.
        - **task_id** (`int`): the task ID.
        """
        x, y = self.dataset[idx]
        return x, y, self.task_id
```
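The wrapping pattern above can be sketched without torch: wrap a dataset so every `(x, y)` item also carries its task ID, leaving the length unchanged. A plain list of `(x, y)` pairs stands in for a PyTorch `Dataset`, and `TaskLabelledSketch` is an illustrative stand-in, not the library class:

```python
class TaskLabelledSketch:
    """Wrap an indexable dataset of (x, y) pairs, appending a fixed task ID."""

    def __init__(self, dataset, task_id: int) -> None:
        self.dataset = dataset
        self.task_id = task_id

    def __len__(self) -> int:
        return len(self.dataset)  # same length as the wrapped dataset

    def __getitem__(self, idx: int):
        x, y = self.dataset[idx]
        return x, y, self.task_id  # append the task label


pairs = [("img0", 3), ("img1", 7)]
labelled = TaskLabelledSketch(pairs, task_id=2)
print(labelled[1])    # ('img1', 7, 2)
print(len(labelled))  # 2
```

Because only `__getitem__` and `__len__` are touched, the wrapper composes transparently with any DataLoader over the original dataset.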
class SEMEION(STLDatasetFromRaw):
    r"""SEMEION dataset. The [SEMEION dataset](https://archive.ics.uci.edu/dataset/178/semeion+handwritten+digit) is a collection of handwritten digits. It consists of 1,593 handwritten digit images (10 classes), each a 16x16 grayscale image."""

    original_dataset_python_class: type[Dataset] = SEMEIONRaw
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        test_percentage: float,
        validation_percentage: float,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original SEMEION data 'SEMEION/' lives.
        - **test_percentage** (`float`): the percentage of data to randomly split into test data.
        - **validation_percentage** (`float`): the percentage of training data to randomly split into validation data.
        - **batch_size** (`int`): the batch size for the train, validation and test dataloaders.
        - **num_workers** (`int`): the number of workers for the dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to the TRAIN dataset only. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalisation and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is `None`, which means no repeat. If not `None`, it must be an integer.
        - **to_tensor** (`bool`): whether to include the `ToTensor()` transform. Default is `True`.
        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is `None`, which means no resize. If not `None`, it must be a tuple of two integers.
        """
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.test_percentage: float = test_percentage
        r"""The percentage of data to randomly split into test data."""
        self.validation_percentage: float = validation_percentage
        r"""The percentage of training data to randomly split into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original SEMEION dataset if it hasn't been downloaded already."""

        SEMEIONRaw(root=self.root, download=True)

        pylogger.debug(
            "The original SEMEION dataset has been downloaded to %s.", self.root
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation datasets.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation datasets.
        """
        dataset_all = SEMEIONRaw(
            root=self.root,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        dataset_train_and_val, _ = random_split(
            dataset_all,
            lengths=[
                1 - self.test_percentage,
                self.test_percentage,
            ],
            generator=torch.Generator().manual_seed(
                42
            ),  # the seed must be fixed so the splits are identical across experiments; don't tie it to the global seed, which may vary across experiments
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # the seed must be fixed so the splits are identical across experiments; don't tie it to the global seed, which may vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        dataset_all = SEMEIONRaw(
            root=self.root,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        _, dataset_test = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        return dataset_test
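`train_and_val_dataset()` and `test_dataset()` each pass a fresh `torch.Generator().manual_seed(42)` to `random_split`, so both methods reproduce the same train+val/test partition and the test set is the exact complement of the train+val set. The idea can be sketched in pure Python, using `random.Random` and a simple `round`-based cut as stand-ins for the seeded `torch.Generator` and `torch.utils.data.random_split` that the actual code uses (whose rounding of fractional lengths may differ by one sample):

```python
import random


def fixed_seed_split(n: int, test_fraction: float, seed: int = 42):
    """Sketch of a fixed-seed split: shuffle indices with a freshly seeded
    RNG, then cut off a test portion. Two calls with the same seed always
    produce the same partition."""
    indices = list(range(n))
    random.Random(seed).shuffle(indices)  # fresh RNG each call -> same order every call
    n_test = round(n * test_fraction)
    return indices[n_test:], indices[:n_test]  # (train_and_val, test)


# Two independent calls, as in train_and_val_dataset() and test_dataset(),
# yield identical and complementary partitions:
train_val_a, test_a = fixed_seed_split(100, 0.2)
train_val_b, test_b = fixed_seed_split(100, 0.2)
```

This is why the seed is hard-coded rather than taken from the experiment's global seed: if the generator varied across runs, the test set returned by `test_dataset()` could overlap with the training data of another run.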
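As a worked example of the two-stage split: with hypothetical values `test_percentage=0.2` and `validation_percentage=0.1` (illustration only, not defaults of the class), the 1,593 SEMEION images are first cut into a test portion, and the validation percentage is then taken from the *remaining* data, not from the full dataset. `random_split` may shift each count by one when the fractions don't divide evenly:

```python
# Hypothetical percentages, chosen for illustration only.
n_total = 1593                  # SEMEION images
test_percentage = 0.2
validation_percentage = 0.1

# First split: hold out the test portion from the full dataset.
n_test = round(n_total * test_percentage)               # about 319 test images
n_train_and_val = n_total - n_test                      # 1274 left for train + val

# Second split: take the validation portion from the remainder.
n_val = round(n_train_and_val * validation_percentage)  # about 127 validation images
n_train = n_train_and_val - n_val                       # about 1147 training images
```

So a `validation_percentage` of 0.1 yields roughly 8% of the full dataset as validation data, because it is applied after the test data has already been removed.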