clarena.stl_datasets.pcam
The submodule in `stl_datasets` for the PCAM dataset.
r"""
The submodule in `stl_datasets` for the PCAM dataset.
"""

__all__ = ["PCAM"]

import logging
from typing import Callable

from torch.utils.data import Dataset
from torchvision.datasets import PCAM as PCAMRaw
from torchvision.transforms import transforms

from clarena.stl_datasets.base import STLDatasetFromRaw

# always get logger for built-in logging in each module
pylogger = logging.getLogger(__name__)


class PCAM(STLDatasetFromRaw):
    r"""PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each a 96x96 color image."""

    original_dataset_python_class: type[Dataset] = PCAMRaw
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' lives.
        - **batch_size** (`int`): the batch size in train, val, test dataloader.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
        """
        # all arguments are forwarded unchanged to the base class
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

    def prepare_data(self) -> None:
        r"""Download the original PCAM dataset if it has not been downloaded yet."""

        # constructing the raw dataset with `download=True` is what fetches each split
        for split in ("train", "val", "test"):
            PCAMRaw(root=self.root, split=split, download=True)

        pylogger.debug(
            "The original PCAM dataset has been downloaded to %s.", self.root
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset.

        Both splits are built from `self.root` with `download=False`, using `self.train_and_val_transforms()` for the inputs and `self.target_transform()` for the targets.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
        """
        dataset_train = PCAMRaw(
            root=self.root,
            split="train",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # the validation split must get the same target transform as the train split,
        # otherwise its labels are left untransformed
        dataset_val = PCAMRaw(
            root=self.root,
            split="val",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        dataset_test = PCAMRaw(
            root=self.root,
            split="test",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
class
PCAM(clarena.stl_datasets.base.STLDatasetFromRaw):
class PCAM(STLDatasetFromRaw):
    r"""PCAM dataset. The [PCAM dataset](https://github.com/basveeling/pcam) is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each a 96x96 color image."""

    original_dataset_python_class: type[Dataset] = PCAMRaw
    r"""The original dataset class."""

    def __init__(
        self,
        root: str,
        batch_size: int = 1,
        num_workers: int = 0,
        custom_transforms: Callable | transforms.Compose | None = None,
        repeat_channels: int | None = None,
        to_tensor: bool = True,
        resize: tuple[int, int] | None = None,
    ) -> None:
        r"""
        **Args:**
        - **root** (`str`): the root directory where the original PCAM data 'PCAM/' lives.
        - **batch_size** (`int`): the batch size in train, val, test dataloader.
        - **num_workers** (`int`): the number of workers for dataloaders.
        - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
        - **repeat_channels** (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
        - **to_tensor** (`bool`): whether to include `ToTensor()` transform. Default is True.
        - **resize** (`tuple[int, int]` | `None`): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
        """
        # all arguments are forwarded unchanged to the base class
        super().__init__(
            root=root,
            batch_size=batch_size,
            num_workers=num_workers,
            custom_transforms=custom_transforms,
            repeat_channels=repeat_channels,
            to_tensor=to_tensor,
            resize=resize,
        )

    def prepare_data(self) -> None:
        r"""Download the original PCAM dataset if it has not been downloaded yet."""

        # constructing the raw dataset with `download=True` is what fetches each split
        for split in ("train", "val", "test"):
            PCAMRaw(root=self.root, split=split, download=True)

        pylogger.debug(
            "The original PCAM dataset has been downloaded to %s.", self.root
        )

    def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
        r"""Get the training and validation dataset.

        Both splits are built from `self.root` with `download=False`, using `self.train_and_val_transforms()` for the inputs and `self.target_transform()` for the targets.

        **Returns:**
        - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
        """
        dataset_train = PCAMRaw(
            root=self.root,
            split="train",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        # the validation split must get the same target transform as the train split,
        # otherwise its labels are left untransformed
        dataset_val = PCAMRaw(
            root=self.root,
            split="val",
            transform=self.train_and_val_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_train, dataset_val

    def test_dataset(self) -> Dataset:
        r"""Get the test dataset.

        **Returns:**
        - **test_dataset** (`Dataset`): the test dataset.
        """
        dataset_test = PCAMRaw(
            root=self.root,
            split="test",
            transform=self.test_transforms(),
            target_transform=self.target_transform(),
            download=False,
        )

        return dataset_test
PCAM dataset. The PCAM dataset is a collection of medical images of breast cancer. It consists of 327,680 images in 2 classes (benign and malignant), each a 96x96 color image.
PCAM( root: str, batch_size: int = 1, num_workers: int = 0, custom_transforms: Union[Callable, torchvision.transforms.transforms.Compose, NoneType] = None, repeat_channels: int | None = None, to_tensor: bool = True, resize: tuple[int, int] | None = None)
def __init__(
    self,
    root: str,
    batch_size: int = 1,
    num_workers: int = 0,
    custom_transforms: Callable | transforms.Compose | None = None,
    repeat_channels: int | None = None,
    to_tensor: bool = True,
    resize: tuple[int, int] | None = None,
) -> None:
    r"""Initialize the PCAM dataset object by forwarding every argument to the base class.

    **Args:**
    - **root** (`str`): the root directory where the original PCAM data 'PCAM/' lives.
    - **batch_size** (`int`): the batch size used by the train, val and test dataloaders.
    - **num_workers** (`int`): the number of workers for the dataloaders.
    - **custom_transforms** (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. It can be a single transform, composed transforms, or `None` for no transform. `ToTensor()`, normalize and so on are not included.
    - **repeat_channels** (`int` | `None`): the number of channels to repeat. `None` (the default) means no repeat.
    - **to_tensor** (`bool`): whether to include the `ToTensor()` transform. Default is True.
    - **resize** (`tuple[int, int]` | `None`): the size to resize the images to, as a tuple of two integers. `None` (the default) means no resize.
    """
    # nothing is handled locally; the base class receives the arguments as-is
    super().__init__(
        root=root,
        batch_size=batch_size,
        num_workers=num_workers,
        custom_transforms=custom_transforms,
        repeat_channels=repeat_channels,
        to_tensor=to_tensor,
        resize=resize,
    )
Args:
- root (`str`): the root directory where the original PCAM data 'PCAM/' lives.
- batch_size (`int`): the batch size in train, val, test dataloader.
- num_workers (`int`): the number of workers for dataloaders.
- custom_transforms (`transform` or `transforms.Compose` or `None`): the custom transforms to apply to ONLY TRAIN dataset. Can be a single transform, composed transforms or no transform. `ToTensor()`, normalize and so on are not included.
- repeat_channels (`int` | `None`): the number of channels to repeat. Default is None, which means no repeat. If not None, it should be an integer.
- to_tensor (`bool`): whether to include `ToTensor()` transform. Default is True.
- resize (`tuple[int, int]` | `None` or list of them): the size to resize the images to. Default is None, which means no resize. If not None, it should be a tuple of two integers.
original_dataset_python_class: type[torch.utils.data.dataset.Dataset] =
<class 'torchvision.datasets.pcam.PCAM'>
The original dataset class.
def
prepare_data(self) -> None:
def prepare_data(self) -> None:
    r"""Download the original PCAM dataset if it has not been downloaded yet."""

    # constructing the raw dataset with `download=True` is what fetches each split;
    # the instances themselves are discarded
    for split_name in ("train", "val", "test"):
        PCAMRaw(root=self.root, split=split_name, download=True)

    pylogger.debug(
        "The original PCAM dataset has been downloaded to %s.", self.root
    )
Download the original PCAM dataset if it has not been downloaded yet.
def
train_and_val_dataset( self) -> tuple[torch.utils.data.dataset.Dataset, torch.utils.data.dataset.Dataset]:
def train_and_val_dataset(self) -> tuple[Dataset, Dataset]:
    r"""Get the training and validation dataset.

    Both splits are built from `self.root` with `download=False`, using `self.train_and_val_transforms()` for the inputs and `self.target_transform()` for the targets.

    **Returns:**
    - **train_and_val_dataset** (`tuple[Dataset, Dataset]`): the train and validation dataset.
    """
    dataset_train = PCAMRaw(
        root=self.root,
        split="train",
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    # the validation split must get the same target transform as the train split,
    # otherwise its labels are left untransformed
    dataset_val = PCAMRaw(
        root=self.root,
        split="val",
        transform=self.train_and_val_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )

    return dataset_train, dataset_val
Get the training and validation dataset.
Returns:
- train_and_val_dataset (`tuple[Dataset, Dataset]`): the train and validation dataset.
def
test_dataset(self) -> torch.utils.data.dataset.Dataset:
def test_dataset(self) -> Dataset:
    r"""Get the test dataset.

    The test split is built from `self.root` with `download=False`, using `self.test_transforms()` for the inputs and `self.target_transform()` for the targets.

    **Returns:**
    - **test_dataset** (`Dataset`): the test dataset.
    """
    return PCAMRaw(
        root=self.root,
        split="test",
        transform=self.test_transforms(),
        target_transform=self.target_transform(),
        download=False,
    )
Get the test dataset.
Returns:
- test_dataset (`Dataset`): the test dataset.