clarena.stl_datasets.facescrub

The submodule in stl_datasets for FaceScrub dataset.

  1r"""
  2The submodule in `stl_datasets` for FaceScrub dataset.
  3"""
  4
  5__all__ = ["FaceScrub"]
  6
  7import logging
  8from typing import Callable
  9
 10import torch
 11from torch.utils.data import Dataset, random_split
 12from torchvision.transforms import transforms
 13
 14from clarena.stl_datasets.base import STLDatasetFromRaw
 15from clarena.stl_datasets.raw import FaceScrub10, FaceScrub20, FaceScrub50, FaceScrub100
 16
 17# always get logger for built-in logging in each module
 18pylogger = logging.getLogger(__name__)
 19
 20
 21class FaceScrub(STLDatasetFromRaw):
 22    r"""FaceScrub dataset. The [original FaceScrub dataset](https://vintage.winklerbros.net/facescrub.html) is a collection of human face images. It consists 106,863 images of 530 people (classes), each high resolution color image.
 23
 24    To make it simple, [this version](https://github.com/nkundiushuti/facescrub_subset) uses subset of the official [Megaface FaceScrub challenge](http://megaface.cs.washington.edu/participate/challenge.html), cropped and resized to 32x32. We have [FaceScrub-10](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_10.zip), [FaceScrub-20](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_20.zip), [FaceScrub-50](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_50.zip), [FaceScrub-100](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_100.zip) datasets where the number of classes are 10, 20, 50 and 100 respectively.
 25    """
 26
 27    def __init__(
 28        self,
 29        root: str,
 30        size: str,
 31        validation_percentage: float,
 32        batch_size: int = 1,
 33        num_workers: int = 0,
 34        custom_transforms: Callable | transforms.Compose | None = None,
 35        repeat_channels: int | None = None,
 36        to_tensor: bool = True,
 37        resize: tuple[int, int] | None = None,
 38    ) -> None:
 39        r"""
 40        **Args:**
 41        - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live.
 42        - **size** (`str`): the size of the dataset; one of:
 43            1. '10': 10 classes (10 people).
 44            2. '20': 20 classes (20 people).
 45            3. '50': 50 classes (50 people).
 46            4. '100': 100 classes (100 people).
 47        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 48        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 49        - **num_workers** (`int`): the number of workers for dataloaders.
 50        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 51        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 52        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 53        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 54        """
 55
 56        if size == "10":
 57            self.original_dataset_python_class: type[Dataset] = FaceScrub10
 58        elif size == "20":
 59            self.original_dataset_python_class: type[Dataset] = FaceScrub20
 60        elif size == "50":
 61            self.original_dataset_python_class: type[Dataset] = FaceScrub50
 62        elif size == "100":
 63            self.original_dataset_python_class: type[Dataset] = FaceScrub100
 64            r"""The original dataset class."""
 65
 66        super().__init__(
 67            root=root,
 68            batch_size=batch_size,
 69            num_workers=num_workers,
 70            custom_transforms=custom_transforms,
 71            repeat_channels=repeat_channels,
 72            to_tensor=to_tensor,
 73            resize=resize,
 74        )
 75
 76        self.validation_percentage: float = validation_percentage
 77        r"""The percentage to randomly split some training data into validation data."""
 78
 79    def prepare_data(self) -> None:
 80        r"""Download the original FaceScrub dataset if haven't."""
 81
 82        self.original_dataset_python_class(root=self.root, train=True, download=True)
 83        self.original_dataset_python_class(root=self.root, train=False, download=True)
 84
 85        pylogger.debug(
 86            "The original FaceScrub dataset has been downloaded to %s.", self.root
 87        )
 88
 89    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 90        """Get the training and validation dataset.
 91
 92        **Returns:**
 93        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 94        """
 95        dataset_train_and_val = self.original_dataset_python_class(
 96            root=self.root,
 97            train=True,
 98            transform=self.train_and_val_transforms(),
 99            target_transform=self.target_transform(),
100            download=False,
101        )
102
103        return random_split(
104            dataset_train_and_val,
105            lengths=[1 - self.validation_percentage, self.validation_percentage],
106            generator=torch.Generator().manual_seed(
107                42
108            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
109        )
110
111    def test_dataset(self) -> Dataset:
112        r"""Get the test dataset.
113
114        **Returns:**
115        - **test_dataset** (`Dataset`): the test dataset.
116        """
117        dataset_test = self.original_dataset_python_class(
118            root=self.root,
119            train=False,
120            transform=self.test_transforms(),
121            target_transform=self.target_transform(),
122            download=False,
123        )
124
125        return dataset_test
class FaceScrub(clarena.stl_datasets.base.STLDatasetFromRaw):
 22class FaceScrub(STLDatasetFromRaw):
 23    r"""FaceScrub dataset. The [original FaceScrub dataset](https://vintage.winklerbros.net/facescrub.html) is a collection of human face images. It consists 106,863 images of 530 people (classes), each high resolution color image.
 24
 25    To make it simple, [this version](https://github.com/nkundiushuti/facescrub_subset) uses subset of the official [Megaface FaceScrub challenge](http://megaface.cs.washington.edu/participate/challenge.html), cropped and resized to 32x32. We have [FaceScrub-10](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_10.zip), [FaceScrub-20](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_20.zip), [FaceScrub-50](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_50.zip), [FaceScrub-100](https://github.com/nkundiushuti/facescrub_subset/blob/master/data/facescrub_100.zip) datasets where the number of classes are 10, 20, 50 and 100 respectively.
 26    """
 27
 28    def __init__(
 29        self,
 30        root: str,
 31        size: str,
 32        validation_percentage: float,
 33        batch_size: int = 1,
 34        num_workers: int = 0,
 35        custom_transforms: Callable | transforms.Compose | None = None,
 36        repeat_channels: int | None = None,
 37        to_tensor: bool = True,
 38        resize: tuple[int, int] | None = None,
 39    ) -> None:
 40        r"""
 41        **Args:**
 42        - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live.
 43        - **size** (`str`): the size of the dataset; one of:
 44            1. '10': 10 classes (10 people).
 45            2. '20': 20 classes (20 people).
 46            3. '50': 50 classes (50 people).
 47            4. '100': 100 classes (100 people).
 48        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
 49        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 50        - **num_workers** (`int`): the number of workers for dataloaders.
 51        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 52        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 53        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 54        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 55        """
 56
 57        if size == "10":
 58            self.original_dataset_python_class: type[Dataset] = FaceScrub10
 59        elif size == "20":
 60            self.original_dataset_python_class: type[Dataset] = FaceScrub20
 61        elif size == "50":
 62            self.original_dataset_python_class: type[Dataset] = FaceScrub50
 63        elif size == "100":
 64            self.original_dataset_python_class: type[Dataset] = FaceScrub100
 65            r"""The original dataset class."""
 66
 67        super().__init__(
 68            root=root,
 69            batch_size=batch_size,
 70            num_workers=num_workers,
 71            custom_transforms=custom_transforms,
 72            repeat_channels=repeat_channels,
 73            to_tensor=to_tensor,
 74            resize=resize,
 75        )
 76
 77        self.validation_percentage: float = validation_percentage
 78        r"""The percentage to randomly split some training data into validation data."""
 79
 80    def prepare_data(self) -> None:
 81        r"""Download the original FaceScrub dataset if haven't."""
 82
 83        self.original_dataset_python_class(root=self.root, train=True, download=True)
 84        self.original_dataset_python_class(root=self.root, train=False, download=True)
 85
 86        pylogger.debug(
 87            "The original FaceScrub dataset has been downloaded to %s.", self.root
 88        )
 89
 90    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 91        """Get the training and validation dataset.
 92
 93        **Returns:**
 94        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 95        """
 96        dataset_train_and_val = self.original_dataset_python_class(
 97            root=self.root,
 98            train=True,
 99            transform=self.train_and_val_transforms(),
100            target_transform=self.target_transform(),
101            download=False,
102        )
103
104        return random_split(
105            dataset_train_and_val,
106            lengths=[1 - self.validation_percentage, self.validation_percentage],
107            generator=torch.Generator().manual_seed(
108                42
109            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
110        )
111
112    def test_dataset(self) -> Dataset:
113        r"""Get the test dataset.
114
115        **Returns:**
116        - **test_dataset** (`Dataset`): the test dataset.
117        """
118        dataset_test = self.original_dataset_python_class(
119            root=self.root,
120            train=False,
121            transform=self.test_transforms(),
122            target_transform=self.target_transform(),
123            download=False,
124        )
125
126        return dataset_test

FaceScrub dataset. The original FaceScrub dataset is a collection of human face images. It consists of 106,863 images of 530 people (classes), each a high-resolution color image.

To make it simple, this version uses a subset of the official Megaface FaceScrub challenge, cropped and resized to 32x32. We have FaceScrub-10, FaceScrub-20, FaceScrub-50, FaceScrub-100 datasets where the numbers of classes are 10, 20, 50 and 100 respectively.

FaceScrub( root: str, size: str, validation_percentage: float, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
28    def __init__(
29        self,
30        root: str,
31        size: str,
32        validation_percentage: float,
33        batch_size: int = 1,
34        num_workers: int = 0,
35        custom_transforms: Callable | transforms.Compose | None = None,
36        repeat_channels: int | None = None,
37        to_tensor: bool = True,
38        resize: tuple[int, int] | None = None,
39    ) -> None:
40        r"""
41        **Args:**
42        - **root** (`str`): the root directory where the original FaceScrub data 'FaceScrub/' live.
43        - **size** (`str`): the size of the dataset; one of:
44            1. '10': 10 classes (10 people).
45            2. '20': 20 classes (20 people).
46            3. '50': 50 classes (50 people).
47            4. '100': 100 classes (100 people).
48        - **validation_percentage** (`float`): the percentage to randomly split some training data into validation data.
49        - **batch_size** (`int`): The batch size in train, val, test dataloader.
50        - **num_workers** (`int`): the number of workers for dataloaders.
51        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
52        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
53        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
54        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
55        """
56
57        if size == "10":
58            self.original_dataset_python_class: type[Dataset] = FaceScrub10
59        elif size == "20":
60            self.original_dataset_python_class: type[Dataset] = FaceScrub20
61        elif size == "50":
62            self.original_dataset_python_class: type[Dataset] = FaceScrub50
63        elif size == "100":
64            self.original_dataset_python_class: type[Dataset] = FaceScrub100
65            r"""The original dataset class."""
66
67        super().__init__(
68            root=root,
69            batch_size=batch_size,
70            num_workers=num_workers,
71            custom_transforms=custom_transforms,
72            repeat_channels=repeat_channels,
73            to_tensor=to_tensor,
74            resize=resize,
75        )
76
77        self.validation_percentage: float = validation_percentage
78        r"""The percentage to randomly split some training data into validation data."""

Args:

  • root (str): the root directory where the original FaceScrub data 'FaceScrub/' live.
  • size (str): the size of the dataset; one of:
    1. '10': 10 classes (10 people).
    2. '20': 20 classes (20 people).
    3. '50': 50 classes (50 people).
    4. '100': 100 classes (100 people).
  • validation_percentage (float): the percentage to randomly split some training data into validation data.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
validation_percentage: float

The percentage to randomly split some training data into validation data.

def prepare_data(self) -> None:
80    def prepare_data(self) -> None:
81        r"""Download the original FaceScrub dataset if haven't."""
82
83        self.original_dataset_python_class(root=self.root, train=True, download=True)
84        self.original_dataset_python_class(root=self.root, train=False, download=True)
85
86        pylogger.debug(
87            "The original FaceScrub dataset has been downloaded to %s.", self.root
88        )

Download the original FaceScrub dataset if it hasn't been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
 90    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 91        """Get the training and validation dataset.
 92
 93        **Returns:**
 94        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 95        """
 96        dataset_train_and_val = self.original_dataset_python_class(
 97            root=self.root,
 98            train=True,
 99            transform=self.train_and_val_transforms(),
100            target_transform=self.target_transform(),
101            download=False,
102        )
103
104        return random_split(
105            dataset_train_and_val,
106            lengths=[1 - self.validation_percentage, self.validation_percentage],
107            generator=torch.Generator().manual_seed(
108                42
109            ),  # this must be set fixed to make sure the datasets across experiments are the same. Don't handle it to global seed as it might vary across experiments
110        )

Get the training and validation dataset.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
112    def test_dataset(self) -> Dataset:
113        r"""Get the test dataset.
114
115        **Returns:**
116        - **test_dataset** (`Dataset`): the test dataset.
117        """
118        dataset_test = self.original_dataset_python_class(
119            root=self.root,
120            train=False,
121            transform=self.test_transforms(),
122            target_transform=self.target_transform(),
123            download=False,
124        )
125
126        return dataset_test

Get the test dataset.

Returns:

  • test_dataset (Dataset): the test dataset.