clarena.stl_datasets.facescrub
The submodule in stl_datasets for FaceScrub dataset.
r"""
The submodule in `stl_datasets` for FaceScrub dataset.
"""

__all__ = ["FaceScrub"]

import logging
from typing import Callable

import torch
from torch.utils.data import Dataset, random_split
from torchvision.transforms import transforms

from clarena.stl_datasets.base import STLDatasetFromRaw
from clarena.stl_datasets.raw import FaceScrub10, FaceScrub20, FaceScrub50, FaceScrub100

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class FaceScrub(STLDatasetFromRaw):
    r"""FaceScrub dataset. The [original FaceScrub dataset](https://vintage.winklerbros.net/facescrub.html) is a collection of human face images. It consists of 106,863 images of 530 people (classes), each a high-resolution color image.

    To make it simple, [this version](https://github.com/nkundiushuti/facescrub_subset) uses a subset of the official [Megaface FaceScrub challenge](http://megaface.cs.washington.edu/participate/challenge.html), cropped and resized to 32x32. We have [FaceScrub-10](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_10.zip), [FaceScrub-20](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_20.zip), [FaceScrub-50](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_50.zip), [FaceScrub-100](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_100.zip) datasets where the number of classes are 10, 20, 50 and 100 respectively.
    """

    def __init__(
        self,
        root: str,
        size: str,
        validation_percentage: float,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live.
        - **size** (`str`): the size of the dataset; one of:
            1. '10': 10 classes (10 people).
            2. '20': 20 classes (20 people).
            3. '50': 50 classes (50 people).
            4. '100': 100 classes (100 people).
        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
        - **batch_size** (`int`): The batch size in train, val, test dataloader.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.

        **Raises:**
        - **ValueError**: if `size` is not one of '10', '20', '50' or '100'.
        """
        # map the size string to the corresponding raw dataset class; validate
        # eagerly so an invalid `size` fails here with a clear ValueError
        # instead of surfacing later as a confusing AttributeError
        size_to_class: dict[str, type[Dataset]] = {
            "10": FaceScrub10,
            "20": FaceScrub20,
            "50": FaceScrub50,
            "100": FaceScrub100,
        }
        if size not in size_to_class:
            raise ValueError(
                f"size must be one of '10', '20', '50', '100', but got {size!r}."
            )
        self.original_dataset_python_class: type[Dataset] = size_to_class[size]
        r"""The original dataset class."""

        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

        self.validation_percentage: float = validation_percentage
        r"""The percentage to randomly split some training data into validation data."""

    def prepare_data(self) -> None:
        r"""Download the original FaceScrub dataset if it hasn't been downloaded already."""
        self.original_dataset_python_class(root=self.root, train=True, download=True)
        self.original_dataset_python_class(root=self.root, train=False, download=True)

        pylogger.debug(
            "The original FaceScrub dataset has been downloaded to %s.", self.root
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
        """
        dataset_train_and_val = self.original_dataset_python_class(
            root=self.root,
            train=True,
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return random_split(
            dataset_train_and_val,
            lengths=[1 - self.validation_percentage, self.validation_percentage],
            generator=torch.Generator().manual_seed(
                42
            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
        )

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        dataset_test = self.original_dataset_python_class(
            root=self.root,
            train=False,
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
class
FaceScrub(clarena.stl_datasets.base.STLDatasetFromRaw):
22class FaceScrub(STLDatasetFromRaw): 23 r"""FaceScrub dataset. The [original FaceScrub dataset](https://vintage.winklerbros.net/facescrub.html) is a collection of human face images. It consists 106,863 images of 530 people (classes), each high resolution color image. 24 25 To make it simple, [this version](https://github.com/nkundiushuti/facescrub_subset) uses subset of the official [Megaface FaceScrub challenge](http://megaface.cs.washington.edu/participate/challenge.html), cropped and resized to 32x32. We have [FaceScrub-10](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_10.zip), [FaceScrub-20](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_20.zip), [FaceScrub-50](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_50.zip), [FaceScrub-100](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_100.zip) datasets where the number of classes are 10, 20, 50 and 100 respectively. 26 """ 27 28 def __init__( 29 self, 30 root: str, 31 size: str, 32 validation_percentage: float, 33 batch_size: int = 1, 34 num_workers: int = 0, 35 custom_transforms: Callable | transforms.Compose | None = None, 36 repeat_channels: int | None = None, 37 to_tensor: bool = True, 38 resize: tuple[int, int] | None = None, 39 ) -> None: 40 r""" 41 **Args:** 42 - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live. 43 - **size** (`str`): the size of the dataset; one of: 44 1. '10': 10 classes (10 people). 45 2. '20': 20 classes (20 people). 46 3. '50': 50 classes (50 people). 47 4. '100': 100 classes (100 people). 48 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 49 - **batch_size** (`int`): The batch size in train, val, test dataloader. 50 - **num_workers** (`int`): the number of workers for dataloaders. 
51 - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included. 52 - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer. 53 - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True. 54 - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers. 55 """ 56 57 if size == "10": 58 self.original_dataset_python_class: type[Dataset] = FaceScrub10 59 elif size == "20": 60 self.original_dataset_python_class: type[Dataset] = FaceScrub20 61 elif size == "50": 62 self.original_dataset_python_class: type[Dataset] = FaceScrub50 63 elif size == "100": 64 self.original_dataset_python_class: type[Dataset] = FaceScrub100 65 r"""The original dataset class.""" 66 67 super().__init__( 68 root=root, 69 batch_size=batch_size, 70 num_workers=num_workers, 71 custom_transforms=custom_transforms, 72 repeat_channels=repeat_channels, 73 to_tensor=to_tensor, 74 resize=resize, 75 ) 76 77 self.validation_percentage: float = validation_percentage 78 r"""The percentage to randomly split some training data into validation data.""" 79 80 def prepare_data(self) -> None: 81 r"""Download the original FaceScrub dataset if haven't.""" 82 83 self.original_dataset_python_class(root=self.root, train=True, download=True) 84 self.original_dataset_python_class(root=self.root, train=False, download=True) 85 86 pylogger.debug( 87 "The original FaceScrub dataset has been downloaded to %s.", self.root 88 ) 89 90 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 91 """Get the training and validation dataset. 
92 93 **Returns:** 94 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset. 95 """ 96 dataset_train_and_val = self.original_dataset_python_class( 97 root=self.root, 98 train=True, 99 transform=self.train_and_val_transforms(), 100 target_transform=self.target_transform(), 101 download=False, 102 ) 103 104 return random_split( 105 dataset_train_and_val, 106 lengths=[1 - self.validation_percentage, self.validation_percentage], 107 generator=torch.Generator().manual_seed( 108 42 109 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 110 ) 111 112 def test_dataset(self) -> Dataset: 113 r"""Get the test dataset. 114 115 **Returns:** 116 - **test_dataset** (`Dataset`): the test dataset. 117 """ 118 dataset_test = self.original_dataset_python_class( 119 root=self.root, 120 train=False, 121 transform=self.test_transforms(), 122 target_transform=self.target_transform(), 123 download=False, 124 ) 125 126 return dataset_test
FaceScrub dataset. The original FaceScrub dataset is a collection of human face images. It consists of 106,863 images of 530 people (classes), each a high-resolution color image.
To make it simple, this version uses a subset of the official Megaface FaceScrub challenge, cropped and resized to 32x32. We have FaceScrub-10, FaceScrub-20, FaceScrub-50, FaceScrub-100 datasets where the number of classes are 10, 20, 50 and 100 respectively.
FaceScrub( root: str, size: str, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
28 def __init__( 29 self, 30 root: str, 31 size: str, 32 validation_percentage: float, 33 batch_size: int = 1, 34 num_workers: int = 0, 35 custom_transforms: Callable | transforms.Compose | None = None, 36 repeat_channels: int | None = None, 37 to_tensor: bool = True, 38 resize: tuple[int, int] | None = None, 39 ) -> None: 40 r""" 41 **Args:** 42 - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live. 43 - **size** (`str`): the size of the dataset; one of: 44 1. '10': 10 classes (10 people). 45 2. '20': 20 classes (20 people). 46 3. '50': 50 classes (50 people). 47 4. '100': 100 classes (100 people). 48 - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data. 49 - **batch_size** (`int`): The batch size in train, val, test dataloader. 50 - **num_workers** (`int`): the number of workers for dataloaders. 51 - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included. 52 - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer. 53 - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True. 54 - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers. 
55 """ 56 57 if size == "10": 58 self.original_dataset_python_class: type[Dataset] = FaceScrub10 59 elif size == "20": 60 self.original_dataset_python_class: type[Dataset] = FaceScrub20 61 elif size == "50": 62 self.original_dataset_python_class: type[Dataset] = FaceScrub50 63 elif size == "100": 64 self.original_dataset_python_class: type[Dataset] = FaceScrub100 65 r"""The original dataset class.""" 66 67 super().__init__( 68 root=root, 69 batch_size=batch_size, 70 num_workers=num_workers, 71 custom_transforms=custom_transforms, 72 repeat_channels=repeat_channels, 73 to_tensor=to_tensor, 74 resize=resize, 75 ) 76 77 self.validation_percentage: float = validation_percentage 78 r"""The percentage to randomly split some training data into validation data."""
Args:
- root (
str): the root directory where the original FaceScrub data 'FaceScrub/' live. - size (
str): the size of the dataset; one of:- '10': 10 classes (10 people).
- '20': 20 classes (20 people).
- '50': 50 classes (50 people).
- '100': 100 classes (100 people).
- validation_percentage (
float): the percentage to randomly split some training data into validation data. - batch_size (
int): The batch size in train, val, test dataloader. - num_workers (
int): the number of workers for dataloaders. - custom_transforms (
transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included. - repeat_channels (
int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer. - to_tensor (
bool): whether to include ToTensor() transform. Default is True. - resize (
tuple[int, int] | None or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
validation_percentage: float
The percentage to randomly split some training data into validation data.
def
prepare_data(self) -> None:
80 def prepare_data(self) -> None: 81 r"""Download the original FaceScrub dataset if haven't.""" 82 83 self.original_dataset_python_class(root=self.root, train=True, download=True) 84 self.original_dataset_python_class(root=self.root, train=False, download=True) 85 86 pylogger.debug( 87 "The original FaceScrub dataset has been downloaded to %s.", self.root 88 )
Download the original FaceScrub dataset if it hasn't been downloaded already.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
90 def train_and_val_dataset(self) -> tuple[Dataset, Dataset]: 91 """Get the training and validation dataset. 92 93 **Returns:** 94 - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset. 95 """ 96 dataset_train_and_val = self.original_dataset_python_class( 97 root=self.root, 98 train=True, 99 transform=self.train_and_val_transforms(), 100 target_transform=self.target_transform(), 101 download=False, 102 ) 103 104 return random_split( 105 dataset_train_and_val, 106 lengths=[1 - self.validation_percentage, self.validation_percentage], 107 generator=torch.Generator().manual_seed( 108 42 109 ), # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments 110 )
Get the training and validation dataset.
Returns:
- train_and_val_dataset (
tuple[Dataset, Dataset]): the train and validation dataset.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
112 def test_dataset(self) -> Dataset: 113 r"""Get the test dataset. 114 115 **Returns:** 116 - **test_dataset** (`Dataset`): the test dataset. 117 """ 118 dataset_test = self.original_dataset_python_class( 119 root=self.root, 120 train=False, 121 transform=self.test_transforms(), 122 target_transform=self.target_transform(), 123 download=False, 124 ) 125 126 return dataset_test
Get the test dataset.
Returns:
- test_dataset (
Dataset): the test dataset.