clarena.stl_datasets.pcam

The submodule in stl_datasets for the PCAM dataset.

  1r"""
  2The submodule in `stl_datasets` for PCAM dataset.
  3"""
  4
  5__all__ = ["PCAM"]
  6
  7import logging
  8from typing import Callable
  9
 10from torch.utils.data import Dataset
 11from torchvision.datasets import PCAM as PCAMRaw
 12from torchvision.transforms import transforms
 13
 14from clarena.stl_datasets.base import STLDatasetFromRaw
 15
 16# always get logger for built-in logging in each module
 17pylogger = logging.getLogger(__name__)
 18
 19
 20class PCAM(STLDatasetFromRaw):
 21    r"""PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each 96x96 color image."""
 22
 23    original_dataset_python_class: type[Dataset] = PCAMRaw
 24    r"""The original dataset class."""
 25
 26    def __init__(
 27        self,
 28        root: str,
 29        batch_size: int = 1,
 30        num_workers: int = 0,
 31        custom_transforms: Callable | transforms.Compose | None = None,
 32        repeat_channels: int | None = None,
 33        to_tensor: bool = True,
 34        resize: tuple[int, int] | None = None,
 35    ) -> None:
 36        r"""
 37        **Args:**
 38        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' live.
 39        - **batch_size** (`int`): The batch size in train, val, test dataloader.
 40        - **num_workers** (`int`): the number of workers for dataloaders.
 41        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
 42        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
 43        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
 44        - **resize** (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
 45        """
 46        super().__init__(
 47            root=root,
 48            batch_size=batch_size,
 49            num_workers=num_workers,
 50            custom_transforms=custom_transforms,
 51            repeat_channels=repeat_channels,
 52            to_tensor=to_tensor,
 53            resize=resize,
 54        )
 55
 56    def prepare_data(self) -> None:
 57        r"""Download the original PCAM dataset if haven't."""
 58
 59        PCAMRaw(root=self.root, split="train", download=True)
 60        PCAMRaw(root=self.root, split="val", download=True)
 61        PCAMRaw(root=self.root, split="test", download=True)
 62
 63        pylogger.debug(
 64            "The original PCAM dataset has been downloaded to %s.", self.root
 65        )
 66
 67    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
 68        """Get the training and validation dataset.
 69
 70        **Returns:**
 71        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
 72        """
 73        dataset_train = PCAMRaw(
 74            root=self.root,
 75            split="train",
 76            transform=self.train_and_val_transforms(),
 77            target_transform=self.target_transform(),
 78            download=False,
 79        )
 80
 81        dataset_val = PCAMRaw(
 82            root=self.root,
 83            split="val",
 84            transform=self.train_and_val_transforms(),
 85            download=False,
 86        )
 87
 88        return dataset_train, dataset_val
 89
 90    def test_dataset(self) -> Dataset:
 91        r"""Get the test dataset.
 92
 93        **Returns:**
 94        - **test_dataset** (`Dataset`): the test dataset.
 95        """
 96        dataset_test = PCAMRaw(
 97            root=self.root,
 98            split="test",
 99            transform=self.test_transforms(),
100            target_transform=self.target_transform(),
101            download=False,
102        )
103
104        return dataset_test
class PCAM(clarena.stl_datasets.base.STLDatasetFromRaw):
 class PCAM(STLDatasetFromRaw):
     r"""PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each a 96x96 color image."""

     original_dataset_python_class: type[Dataset] = PCAMRaw
     r"""The original dataset class."""

     def __init__(
         self,
         root: str,
         batch_size: int = 1,
         num_workers: int = 0,
         custom_transforms: Callable | transforms.Compose | None = None,
         repeat_channels: int | None = None,
         to_tensor: bool = True,
         resize: tuple[int, int] | None = None,
     ) -> None:
         r"""Initialize the PCAM dataset object. All arguments are forwarded unchanged to the base class.

         **Args:**
         - **root** (`str`): the root directory where the original PCAM data 'PCAM/' lives.
         - **batch_size** (`int`): the batch size in train, val, test dataloader.
         - **num_workers** (`int`): the number of workers for dataloaders.
         - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
         - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
         - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
         - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
         """
         super().__init__(
             root=root,
             batch_size=batch_size,
             num_workers=num_workers,
             custom_transforms=custom_transforms,
             repeat_channels=repeat_channels,
             to_tensor=to_tensor,
             resize=resize,
         )

     def prepare_data(self) -> None:
         r"""Download the train, val and test splits of the original PCAM dataset if they have not been downloaded yet."""

         # constructing the raw dataset with `download=True` is what triggers the download; the instances are discarded
         for split in ("train", "val", "test"):
             PCAMRaw(root=self.root, split=split, download=True)

         pylogger.debug(
             "The original PCAM dataset has been downloaded to %s.", self.root
         )

     def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
         r"""Get the training and validation dataset.

         The training dataset is read from the `train` split and the validation dataset from the `val` split of the original dataset. Both use the same input transforms and the same target transform.

         **Returns:**
         - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
         """
         dataset_train = PCAMRaw(
             root=self.root,
             split="train",
             transform=self.train_and_val_transforms(),
             target_transform=self.target_transform(),
             download=False,
         )

         dataset_val = PCAMRaw(
             root=self.root,
             split="val",
             transform=self.train_and_val_transforms(),
             # keep validation labels consistent with the train and test datasets
             target_transform=self.target_transform(),
             download=False,
         )

         return dataset_train, dataset_val

     def test_dataset(self) -> Dataset:
         r"""Get the test dataset.

         **Returns:**
         - **test_dataset** (`Dataset`): the test dataset.
         """
         dataset_test = PCAMRaw(
             root=self.root,
             split="test",
             transform=self.test_transforms(),
             target_transform=self.target_transform(),
             download=False,
         )

         return dataset_test

PCAM dataset. The PCAM dataset is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each a 96x96 color image.

PCAM( root: str, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
def __init__(
    self,
    root: str,
    batch_size: int = 1,
    num_workers: int = 0,
    custom_transforms: Callable | transforms.Compose | None = None,
    repeat_channels: int | None = None,
    to_tensor: bool = True,
    resize: tuple[int, int] | None = None,
) -> None:
    r"""Set up the PCAM dataset object by handing every argument over to the base class.

    **Args:**
    - **root** (`str`): the root directory where the original PCAM data 'PCAM/' lives.
    - **batch_size** (`int`): the batch size in train, val, test dataloader.
    - **num_workers** (`int`): the number of workers for dataloaders.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
    - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
    - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
    - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
    """
    # this constructor keeps no state of its own; it only forwards to the base class
    forwarded = {
        "root": root,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "custom_transforms": custom_transforms,
        "repeat_channels": repeat_channels,
        "to_tensor": to_tensor,
        "resize": resize,
    }
    super().__init__(**forwarded)

Args:

  • root (str): the root directory where the original PCAM data 'PCAM/' lives.
  • batch_size (int): The batch size in train, val, test dataloader.
  • num_workers (int): the number of workers for dataloaders.
  • custom_transforms (transform or transforms.Compose or None): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. ToTensor(), normalize and so on are not included.
  • repeat_channels (int | None): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
  • to_tensor (bool): whether to include ToTensor() transform. Default is True.
  • resize (tuple[int, int] | None): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] = <class 'torchvision.datasets.pcam.PCAM'>

The original dataset class.

def prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Download the train, val and test splits of the original PCAM dataset if they have not been downloaded yet."""

    # constructing the raw dataset with `download=True` is what triggers the download; the instances are discarded
    for split_name in ("train", "val", "test"):
        PCAMRaw(root=self.root, split=split_name, download=True)

    pylogger.debug(
        "The original PCAM dataset has been downloaded to %s.", self.root
    )

Download the original PCAM dataset if it has not been downloaded yet.

def train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset.

    The training dataset is read from the `train` split and the validation dataset from the `val` split of the original dataset. Both use the same input transforms and the same target transform.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
    """
    dataset_train = PCAMRaw(
        root=self.root,
        split="train",
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    dataset_val = PCAMRaw(
        root=self.root,
        split="val",
        transform=self.train_and_val_transforms(),
        # keep validation labels consistent with the train and test datasets
        target_transform=self.target_transform(),
        download=False,
    )

    return dataset_train, dataset_val

Get the training and validation dataset.

Returns:

  • train_and_val_dataset (tuple[Dataset, Dataset]): the train and validation dataset.
def test_dataset(self) -> torch.utils.data.dataset.Dataset:
 def test_dataset(self) -> Dataset:
     r"""Build the test dataset from the `test` split of the original PCAM data.

     **Returns:**
     - **test_dataset** (`Dataset`): the test dataset.
     """
     # the data is expected to be on disk already, hence `download=False`
     return PCAMRaw(
         root=self.root,
         split="test",
         transform=self.test_transforms(),
         target_transform=self.target_transform(),
         download=False,
     )

Get the test dataset.

Returns:

  • test_dataset (Dataset): the test dataset.