clarena.stl_datasets.caltech101
The submodule in stl_datasets for Caltech 101 dataset.
r"""
The submodule in `stl_datasets` for Caltech 101 dataset.
"""

__all__ = ["Caltech101"]

import logging
from typing import Callable

import torch
from torch.utils.data import Dataset, random_split
from torchvision.datasets import Caltech101 as Caltech101Raw
from torchvision.transforms import transforms

from clarena.stl_datasets.base import STLDatasetFromRaw

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class Caltech101(STLDatasetFromRaw):
    r"""Caltech 101 dataset. The [Caltech 101 dataset](https://data.caltech.edu/records/mzrjq-6wc02) is a collection of pictures of objects. It consists of 9,146 images of 101 classes, each a color image.

    The raw dataset is loaded as a single collection here, so the test and validation subsets are carved out with seeded `random_split` calls.
    """

    original_dataset_python_class: type[Dataset] = Caltech101Raw
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        test_percentage: float,
        validation_percentage: float,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original Caltech data 'Caltech/' lives.
        - **test_percentage** (`float`): the fraction of all data to randomly split off as test data.
        - **validation_percentage** (`float`): the fraction of the remaining (non-test) data to randomly split off as validation data.
        - **batch_size** (`int`): the batch size in train, val, test dataloader.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
        """
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.test_percentage: float = test_percentage
        r"""The percentage to randomly split some data into test data."""
        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original Caltech 101 dataset if it has not been downloaded yet."""

        # the instance is discarded; it is created only for its `download=True` side effect
        Caltech101Raw(root=self.root, download=True)

        pylogger.debug(
            "The original Caltech 101 dataset has been downloaded to %s.",
            self.root,
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
        """
        dataset_all = Caltech101Raw(
            root=self.root,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # the seed must stay fixed so that the datasets are the same across experiments;
        # do not tie it to the global seed, which may vary across experiments
        dataset_train_and_val, _ = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # same reasoning for the fixed seed as above
        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(42),
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        # Use the test-time transforms: the custom transforms are documented as applying to the
        # train dataset only, so they must not be applied to the held-out test data.
        # NOTE(review): assumes the base class exposes `test_transforms()` next to
        # `train_and_val_transforms()` -- confirm against `STLDatasetFromRaw`.
        dataset_all = Caltech101Raw(
            root=self.root,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # same seed and lengths as the first split in `train_and_val_dataset`, so the second
        # part here is the complement of the train/val part selected there
        _, dataset_test = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        return dataset_test
class
Caltech101(clarena.stl_datasets.base.STLDatasetFromRaw):
class Caltech101(STLDatasetFromRaw):
    r"""Caltech 101 dataset. The [Caltech 101 dataset](https://data.caltech.edu/records/mzrjq-6wc02) is a collection of pictures of objects. It consists of 9,146 images of 101 classes, each a color image.

    The raw dataset is loaded as a single collection here, so the test and validation subsets are carved out with seeded `random_split` calls.
    """

    original_dataset_python_class: type[Dataset] = Caltech101Raw
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        test_percentage: float,
        validation_percentage: float,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original Caltech data 'Caltech/' lives.
        - **test_percentage** (`float`): the fraction of all data to randomly split off as test data.
        - **validation_percentage** (`float`): the fraction of the remaining (non-test) data to randomly split off as validation data.
        - **batch_size** (`int`): the batch size in train, val, test dataloader.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
        """
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.test_percentage: float = test_percentage
        r"""The percentage to randomly split some data into test data."""
        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original Caltech 101 dataset if it has not been downloaded yet."""

        # the instance is discarded; it is created only for its `download=True` side effect
        Caltech101Raw(root=self.root, download=True)

        pylogger.debug(
            "The original Caltech 101 dataset has been downloaded to %s.",
            self.root,
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
        """
        dataset_all = Caltech101Raw(
            root=self.root,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # the seed must stay fixed so that the datasets are the same across experiments;
        # do not tie it to the global seed, which may vary across experiments
        dataset_train_and_val, _ = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        # same reasoning for the fixed seed as above
        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(42),
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        # Use the test-time transforms: the custom transforms are documented as applying to the
        # train dataset only, so they must not be applied to the held-out test data.
        # NOTE(review): assumes the base class exposes `test_transforms()` next to
        # `train_and_val_transforms()` -- confirm against `STLDatasetFromRaw`.
        dataset_all = Caltech101Raw(
            root=self.root,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # same seed and lengths as the first split in `train_and_val_dataset`, so the second
        # part here is the complement of the train/val part selected there
        _, dataset_test = random_split(
            dataset_all,
            lengths=[1 - self.test_percentage, self.test_percentage],
            generator=torch.Generator().manual_seed(42),
        )

        return dataset_test
Caltech 101 dataset. The Caltech 101 dataset is a collection of pictures of objects. It consists of 9,146 images of 101 classes, each a color image.
Caltech101( root: str, test_percentage: float, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
def __init__(
    self,
    root: str,
    test_percentage: float,
    validation_percentage: float,
    batch_size: int = 1,
    num_workers: int = 0,
    custom_transforms: Callable | transforms.Compose | None = None,
    repeat_channels: int | None = None,
    to_tensor: bool = True,
    resize: tuple[int, int] | None = None,
) -> None:
    r"""Set up the Caltech 101 datamodule and remember the split fractions.

    **Args:**
    - **root** (`str`): the root directory where the original Caltech data 'Caltech/' lives.
    - **test_percentage** (`float`): the fraction of all data to randomly split off as test data.
    - **validation_percentage** (`float`): the fraction of the training data to randomly split off as validation data.
    - **batch_size** (`int`): the batch size in train, val, test dataloader.
    - **num_workers** (`int`): the number of workers for dataloaders.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
    - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
    - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
    - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
    """
    # everything except the two split fractions is forwarded unchanged to the base class
    super().__init__(
        root=root,
        batch_size=batch_size,
        num_workers=num_workers,
        custom_transforms=custom_transforms,
        repeat_channels=repeat_channels,
        to_tensor=to_tensor,
        resize=resize,
    )

    self.validation_percentage: float = validation_percentage
    r"""The percentage to randomly split some training data into validation data."""
    self.test_percentage: float = test_percentage
    r"""The percentage to randomly split some data into test data."""
Args:
- root (str): the root directory where the original Caltech data 'Caltech/' lives.
- test_percentage (float): the percentage to randomly split some data into test data.
- validation_percentage (float): the percentage to randomly split some training data into validation data.
- batch_size (int): the batch size in train, val, test dataloader.
- num_workers (int): the number of workers for dataloaders.
- custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
- repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
- to_tensor (bool): whether to include the ToTensor() transform. Default is True.
- resize (tuple[int, int] | None or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'torchvision.datasets.caltech.Caltech101'>
The original dataset class.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def
prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Request a download of the original Caltech 101 dataset into `self.root`."""
    download_root = self.root

    # the instance is thrown away; it is constructed only for its `download=True` side effect
    Caltech101Raw(root=download_root, download=True)

    pylogger.debug(
        "The original Caltech 101 dataset has been downloaded to %s.", download_root
    )
Download the original Caltech 101 dataset if it has not been downloaded yet.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Build the training and validation subsets of Caltech 101.

    The full dataset is first split into a non-test part and a test part; the non-test part is then split again into training and validation data.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
    """
    full_dataset = Caltech101Raw(
        root=self.root,
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # Both splits use their own generator with a hard-coded seed so that the datasets are the
    # same across experiments; the global seed is deliberately not used as it might vary.
    test_fraction_split = [1 - self.test_percentage, self.test_percentage]
    non_test_part, _ = random_split(
        full_dataset,
        lengths=test_fraction_split,
        generator=torch.Generator().manual_seed(42),
    )

    val_fraction_split = [
        1 - self.validation_percentage,
        self.validation_percentage,
    ]
    train_val_parts = random_split(
        non_test_part,
        lengths=val_fraction_split,
        generator=torch.Generator().manual_seed(42),
    )
    return train_val_parts
Get the training and validation dataset.
Returns:
- train_and_val_dataset (
tuple[Dataset, Dataset]): the train and validation dataset.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
def test_dataset(self) -> Dataset:
    r"""Get the test dataset.

    The whole Caltech 101 dataset is loaded and the second part of a seeded random split, sized by `self.test_percentage`, is returned.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset.
    """
    # Use the test-time transforms: the custom transforms are documented as applying to the
    # train dataset only, so they must not be applied to the held-out test data.
    # NOTE(review): assumes the base class exposes `test_transforms()` next to
    # `train_and_val_transforms()` -- confirm against `STLDatasetFromRaw`.
    dataset_all = Caltech101Raw(
        root=self.root,
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # same seed and lengths as the first split in `train_and_val_dataset`, so the second part
    # here is the complement of the train/val part selected there
    _, dataset_test = random_split(
        dataset_all,
        lengths=[1 - self.test_percentage, self.test_percentage],
        generator=torch.Generator().manual_seed(42),
    )

    return dataset_test
Get the test dataset.
Returns:
- test_dataset (
Dataset): the test dataset.