clarena.backbones.resnet

The submodule in backbones for ResNet backbone networks. It includes the basic ResNet and continual learning ResNet variants.

   1r"""
   2The submodule in `backbones` for ResNet backbone networks. It includes the basic ResNet and continual learning ResNet variants.
   3"""
   4
# Public API of this submodule: the two building blocks, the base ResNet,
# and the standard / continual-learning ResNet variants.
__all__ = [
    "ResNetBlockSmall",
    "ResNetBlockLarge",
    "ResNetBase",
    "ResNet18",
    "ResNet34",
    "ResNet50",
    "ResNet101",
    "ResNet152",
    "CLResNet18",
    "CLResNet34",
    "CLResNet50",
    "CLResNet101",
    "CLResNet152",
]
  20
  21import logging
  22
  23import torchvision
  24from torch import Tensor, nn
  25
  26from clarena.backbones import Backbone, CLBackbone
  27from clarena.backbones.constants import RESNET18_STATE_DICT_MAPPING
  28
  29# always get logger for built-in logging in each module
  30pylogger = logging.getLogger(__name__)
  31
  32
  33class ResNetBlockSmall(Backbone):
  34    r"""The smaller building block for ResNet-18/34.
  35
  36    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
  37    """
  38
  39    def __init__(
  40        self,
  41        outer_layer_name: str,
  42        block_idx: int,
  43        preceding_output_channels: int,
  44        input_channels: int,
  45        overall_stride: int,
  46        activation_layer: nn.Module | None = nn.ReLU,
  47        batch_normalization: bool = True,
  48        bias: bool = False,
  49        output_dim: int | None = None,
  50        **kwargs,
  51    ) -> None:
  52        r"""Construct and initialize the smaller building block.
  53
  54        **Args:**
  55        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  56        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  57        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
  58        - **input_channels** (`int`): the number of channels of input of this building block.
  59        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
  60        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
  61        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
  62        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
  63        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
  64        - **kwargs**: Reserved for multiple inheritance.
  65        """
  66        super().__init__(output_dim=None, **kwargs)
  67
  68        self.batch_normalization: bool = batch_normalization
  69        r"""Whether to use batch normalization after convolutional layers."""
  70        self.activation: bool = activation_layer is not None
  71        r"""Whether to use activation function after convolutional layers."""
  72
  73        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
  74        r"""Format and store full name of the 1st weighted convolutional layer. """
  75        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
  76        r"""Format and store full name of the 2nd weighted convolutional layer. """
  77
  78        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
  79        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
  80        layer_output_channels = (
  81            input_channels  # the output channels of the 1st convolutional layer
  82        )
  83        self.conv1 = nn.Conv2d(
  84            in_channels=layer_input_channels,
  85            out_channels=layer_output_channels,
  86            kernel_size=3,
  87            stride=1,
  88            padding=1,
  89            bias=bias,
  90        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
  91        r"""The 1st weight convolutional layer of the smaller building block. """
  92        self.weighted_layer_names.append(
  93            self.full_1st_layer_name
  94        )  # update the weighted layer names
  95        if self.batch_normalization:
  96            self.conv_bn1 = nn.BatchNorm2d(
  97                num_features=layer_output_channels
  98            )  # construct the batch normalization layer
  99            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 100        if self.activation:
 101            self.conv_activation1 = activation_layer()  # construct the activation layer
 102            r"""The activation layer after the 1st weighted convolutional layer. """
 103
 104        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 105        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
 106        layer_output_channels = (
 107            input_channels * 1
 108        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 109        self.conv2 = nn.Conv2d(
 110            in_channels=layer_input_channels,
 111            out_channels=layer_output_channels,
 112            kernel_size=3,
 113            stride=overall_stride,
 114            padding=1,
 115            bias=bias,
 116        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
 117        r"""The 2nd weight convolutional layer of the smaller building block. """
 118        self.weighted_layer_names.append(
 119            self.full_2nd_layer_name
 120        )  # update the weighted layer names
 121        if batch_normalization:
 122            self.conv_bn2 = nn.BatchNorm2d(
 123                num_features=layer_output_channels
 124            )  # construct the batch normalization layer
 125            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
 126        if self.activation:
 127            self.conv_activation2 = activation_layer()  # construct the activation layer
 128            r"""The activation layer after the 2nd weighted convolutional layer. """
 129
 130        self.identity_downsample: nn.Module = (
 131            nn.Conv2d(
 132                in_channels=preceding_output_channels,
 133                out_channels=input_channels,
 134                kernel_size=1,
 135                stride=overall_stride,
 136                bias=False,
 137            )
 138            if preceding_output_channels != input_channels or overall_stride != 1
 139            else None
 140        )  # construct the identity downsample function
 141        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
 142
 143    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
 144        r"""The forward pass for data.
 145
 146        **Args:**
 147        - **input** (`Tensor`): the input feature maps.
 148
 149        **Returns:**
 150        - **output_feature** (`Tensor`): the output feature maps.
 151        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 152        """
 153        activations = {}
 154
 155        identity = (
 156            self.identity_downsample(input)
 157            if self.identity_downsample is not None
 158            else input
 159        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
 160
 161        x = input
 162        x = self.conv1(x)
 163        if self.batch_normalization:
 164            x = self.conv_bn1(
 165                x
 166            )  # batch normalization can be before or after activation. We put it before activation here
 167        if self.activation:
 168            x = self.conv_activation1(x)
 169        activations[self.full_1st_layer_name] = x  # store the hidden feature
 170
 171        x = self.conv2(x)
 172        if self.batch_normalization:
 173            x = self.conv_bn2(
 174                x
 175            )  # batch normalization can be before or after activation. We put it before activation here
 176
 177        x = x + identity
 178        if self.activation:
 179            x = self.conv_activation2(x)  # activation after the shortcut connection
 180        activations[self.full_2nd_layer_name] = x  # store the hidden feature
 181
 182        output_feature = x
 183
 184        return output_feature, activations
 185
 186
 187class ResNetBlockLarge(Backbone):
 188    r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the paper.
 189
 190    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 191    """
 192
 193    def __init__(
 194        self,
 195        outer_layer_name: str,
 196        block_idx: int,
 197        preceding_output_channels: int,
 198        input_channels: int,
 199        overall_stride: int,
 200        activation_layer: nn.Module | None = nn.ReLU,
 201        batch_normalization: bool = True,
 202        bias: bool = False,
 203        output_dim: int | None = None,
 204        **kwargs,
 205    ) -> None:
 206        r"""Construct and initialize the larger building block.
 207
 208        **Args:**
 209        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 210        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 211        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 212        - **input_channels** (`int`): the number of channels of input of this building block.
 213        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
 214        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 215        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 216        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 217        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
 218        - **kwargs**: Reserved for multiple inheritance.
 219        """
 220        super().__init__(output_dim=None, **kwargs)
 221
 222        self.batch_normalization: bool = batch_normalization
 223        r"""Whether to use batch normalization after convolutional layers."""
 224        self.activation: bool = activation_layer is not None
 225        r"""Whether to use activation function after convolutional layers."""
 226
 227        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
 228        r"""Format and store full name of the 1st weighted convolutional layer. """
 229        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
 230        r"""Format and store full name of the 2nd weighted convolutional layer. """
 231        self.full_3rd_layer_name = f"{outer_layer_name}/{block_idx}/conv3"
 232        r"""Format and store full name of the 3rd weighted convolutional layer. """
 233
 234        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 235        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
 236        layer_output_channels = (
 237            input_channels  # the output channels of the 1st convolutional layer
 238        )
 239        self.conv1 = nn.Conv2d(
 240            in_channels=layer_input_channels,
 241            out_channels=layer_output_channels,
 242            kernel_size=1,
 243            stride=1,
 244            padding=0,
 245            bias=bias,
 246        )  # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here
 247        r"""The 1st weight convolutional layer of the larger building block. """
 248        self.weighted_layer_names.append(
 249            self.full_1st_layer_name
 250        )  # update the weighted layer names
 251        if self.batch_normalization:
 252            self.conv_bn1 = nn.BatchNorm2d(
 253                num_features=layer_output_channels
 254            )  # construct the batch normalization layer
 255            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 256        if self.activation:
 257            self.conv_activation1 = activation_layer()  # construct the activation layer
 258            r"""The activation layer after the 1st weighted convolutional layer. """
 259
 260        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 261        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
 262        layer_output_channels = (
 263            input_channels
 264            * 1  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 265        )
 266        self.conv2 = nn.Conv2d(
 267            in_channels=layer_input_channels,
 268            out_channels=layer_output_channels,
 269            kernel_size=3,
 270            stride=overall_stride,
 271            padding=1,
 272            bias=bias,
 273        )  # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here
 274        r"""The 2nd weight convolutional layer of the larger building block. """
 275        self.weighted_layer_names.append(
 276            self.full_2nd_layer_name
 277        )  # update the weighted layer names
 278        if self.batch_normalization:
 279            self.conv_bn2 = nn.BatchNorm2d(
 280                num_features=layer_output_channels
 281            )  # construct the batch normalization layer
 282            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
 283        if self.activation:
 284            self.conv_activation2 = activation_layer()  # construct the activation layer
 285            r"""The activation layer after the 2nd weighted convolutional layer. """
 286
 287        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 288        layer_input_channels = (
 289            input_channels * 1
 290        )  # the input channels of the 3rd (final) convolutional layer, same as output of 2nd layer
 291        layer_output_channels = (
 292            input_channels * 4  # the output channels of the 3rd layer (4x expansion)
 293        )
 294        self.conv3 = nn.Conv2d(
 295            in_channels=layer_input_channels,
 296            out_channels=layer_output_channels,
 297            kernel_size=1,
 298            stride=1,
 299            padding=0,
 300            bias=bias,
 301        )  # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here
 302        r"""The 3rd weight convolutional layer of the larger building block. """
 303        self.weighted_layer_names.append(
 304            self.full_3rd_layer_name
 305        )  # update the weighted layer names
 306        if batch_normalization:
 307            self.conv_bn3 = nn.BatchNorm2d(
 308                num_features=layer_output_channels
 309            )  # construct the batch normalization layer
 310            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
 311        if self.activation:
 312            self.conv_activation3 = activation_layer()  # construct the activation layer
 313            r"""The activation layer after the 3rd weighted convolutional layer. """
 314
 315        self.identity_downsample: nn.Module = (
 316            nn.Conv2d(
 317                in_channels=preceding_output_channels,
 318                out_channels=input_channels * 4,
 319                kernel_size=1,
 320                stride=overall_stride,
 321                bias=False,
 322            )
 323            if preceding_output_channels != input_channels * 4 or overall_stride != 1
 324            else None
 325        )
 326        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
 327
 328    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
 329        r"""The forward pass for data.
 330
 331        **Args:**
 332        - **input** (`Tensor`): the input feature maps.
 333
 334        **Returns:**
 335        - **output_feature** (`Tensor`): the output feature maps.
 336        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 337        """
 338        activations = {}
 339
 340        identity = (
 341            self.identity_downsample(input)
 342            if self.identity_downsample is not None
 343            else input
 344        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
 345
 346        x = input
 347        x = self.conv1(x)
 348        if self.batch_normalization:
 349            x = self.conv_bn1(
 350                x
 351            )  # batch normalization can be before or after activation. We put it before activation here
 352        if self.activation:
 353            x = self.conv_activation1(x)
 354        activations[self.full_1st_layer_name] = x  # store the hidden feature
 355
 356        x = self.conv2(x)
 357        if self.batch_normalization:
 358            x = self.conv_bn2(
 359                x
 360            )  # batch normalization can be before or after activation. We put it before activation here
 361        if self.activation:
 362            x = self.conv_activation2(x)
 363        activations[self.full_2nd_layer_name] = x  # store the hidden feature
 364
 365        x = self.conv3(x)
 366        if self.batch_normalization:
 367            x = self.conv_bn3(
 368                x
 369            )  # batch normalization can be before or after activation. We put it before activation here
 370
 371        x = x + identity
 372        if self.activation:
 373            x = self.conv_activation3(x)  # activation after the shortcut connection
 374        activations[self.full_3rd_layer_name] = x  # store the hidden feature
 375
 376        output_feature = x
 377
 378        return output_feature, activations
 379
 380
 381class ResNetBase(Backbone):
 382    r"""The base class of [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 383
 384    ResNet is a convolutional network architecture, which has 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers which contains multiple convolutional parameter layer. Each layer of the 4 are constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are average pooling layer and a fully connected layer which connects to the CL output heads.
 385    """
 386
    def __init__(
        self,
        input_channels: int,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: type[nn.Module] | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`type[nn.Module]` | `None`): the class of the activation function placed after each layer (if not `None`); if `None`, no activation layers are used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(output_dim=output_dim, **kwargs)

        self.batch_normalization: bool = batch_normalization
        r"""Whether to use batch normalization after convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Whether to use activation function after convolutional layers."""

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receive the input of the entire network
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )  # construct the 1st weight convolutional layer of the entire ResNet
        r"""The 1st weight convolutional layer of the entire ResNet. It  is always with fixed kernel size 7x7, stride 2, and padding 3. """
        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
        if self.batch_normalization:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer. """

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  # not a weighted layer, so not registered in `weighted_layer_names`
        r"""The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2. """

        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """
 504
 505    def _multiple_blocks(
 506        self,
 507        layer_name: str,
 508        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
 509        building_block_num: int,
 510        preceding_output_channels: int,
 511        input_channels: int,
 512        overall_stride: int,
 513        activation_layer: nn.Module | None = nn.ReLU,
 514        batch_normalization: bool = True,
 515        bias: bool = False,
 516    ) -> nn.Sequential:
 517        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2-5 convolutional layers of the ResNet.
 518
 519        The "shortcut connections" are performed between the input and output of each building block:
 520        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
 521        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer to make the dimensions match.
 522
 523        **Args:**
 524        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
 525        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of the building block.
 526        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
 527        - **preceding_output_channels** (`int`): the number of channels of preceding output of this entire multi-building-block layer.
 528        - **input_channels** (`int`): the number of channels of input of this multi-building-block layer.
 529        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block where other building blocks remain their own overall stride of 1. Inside that building block, this stride is performed at certain convolutional layer in the building block where other convolutional layers remain stride of 1:
 530            - For `ResNetBlockSmall`, it performs at the 2nd (last) layer.
 531            - For `ResNetBlockLarge`, it performs at the 2nd (middle) layer.
 532        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 533        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 534        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 535
 536        **Returns:**
 537        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
 538        """
 539
 540        layer = []
 541
 542        for block_idx in range(building_block_num):
 543            layer.append(
 544                building_block_type(
 545                    outer_layer_name=layer_name,
 546                    block_idx=block_idx,
 547                    preceding_output_channels=(
 548                        preceding_output_channels
 549                        if block_idx == 0
 550                        else (
 551                            input_channels
 552                            if building_block_type == ResNetBlockSmall
 553                            else input_channels * 4
 554                        )
 555                    ),  # if it's the 1st block in this multi-building-block layer, it should be the number of channels of the preceding output of this entire multi-building-block layer. Otherwise, it should be the number of channels from last building block where the number of channels is 4 times of the input channels for `ResNetBlockLarge` than `ResNetBlockSmall`.
 556                    input_channels=input_channels,
 557                    overall_stride=(
 558                        overall_stride if block_idx == 0 else 1
 559                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
 560                    activation_layer=activation_layer,
 561                    batch_normalization=batch_normalization,
 562                    bias=bias,
 563                )
 564            )
 565
 566            self.weighted_layer_names += layer[
 567                -1
 568            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync to the weighted layer names list in the outer network
 569
 570        return nn.Sequential(*layer)
 571
 572    def forward(
 573        self, input: Tensor, stage: str = None
 574    ) -> tuple[Tensor, dict[str, Tensor]]:
 575        r"""The forward pass for data. It is the same for all tasks.
 576
 577        **Args:**
 578        - **input** (`Tensor`): the input tensor from data.
 579
 580        **Returns:**
 581        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
 582        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 583        """
 584        batch_size = input.size(0)
 585        activations = {}
 586
 587        x = input
 588
 589        x = self.conv1(x)
 590        if self.batch_normalization:
 591            x = self.conv_bn1(x)
 592        if self.activation:
 593            x = self.conv_activation1(x)
 594        activations["conv1"] = x
 595
 596        x = self.maxpool(x)
 597
 598        for block in self.conv2x:
 599            x, activations_block = block(x)
 600            activations.update(activations_block)  # store the hidden feature
 601        for block in self.conv3x:
 602            x, activations_block = block(x)
 603            activations.update(activations_block)  # store the hidden feature
 604        for block in self.conv4x:
 605            x, activations_block = block(x)
 606            activations.update(activations_block)  # store the hidden feature
 607        for block in self.conv5x:
 608            x, activations_block = block(x)
 609            activations.update(activations_block)  # store the hidden feature
 610
 611        x = self.avepool(x)
 612
 613        output_feature = x.view(batch_size, -1)  # flatten before going through heads
 614
 615        return output_feature, activations
 616
 617
 618class ResNet18(ResNetBase):
 619    r"""ResNet-18 backbone network.
 620
 621    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
 622    """
 623
 624    def __init__(
 625        self,
 626        input_channels: int,
 627        output_dim: int,
 628        activation_layer: nn.Module | None = nn.ReLU,
 629        batch_normalization: bool = True,
 630        bias: bool = False,
 631        pretrained_weights: str | None = None,
 632        **kwargs,
 633    ) -> None:
 634        r"""Construct and initialize the ResNet-18 backbone network.
 635
 636        **Args:**
 637        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 638        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 639        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 640        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 641        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 642        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
 643        - **kwargs**: Reserved for multiple inheritance.
 644        """
 645        super().__init__(
 646            input_channels=input_channels,
 647            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
 648            building_block_nums=(2, 2, 2, 2),
 649            building_block_preceding_output_channels=(64, 64, 128, 256),
 650            building_block_input_channels=(64, 128, 256, 512),
 651            output_dim=output_dim,
 652            activation_layer=activation_layer,
 653            batch_normalization=batch_normalization,
 654            bias=bias,
 655            **kwargs,
 656        )
 657
 658        if pretrained_weights is not None:
 659            # load the pretrained weights from TorchVision
 660            torchvision_resnet18_state_dict = torchvision.models.resnet18(
 661                weights=pretrained_weights
 662            ).state_dict()
 663
 664            # mapping from torchvision resnet18 state dict to our ResNet18 state dict
 665            state_dict_converted = {}
 666            for key, value in torchvision_resnet18_state_dict.items():
 667                if RESNET18_STATE_DICT_MAPPING[key] is not None:
 668                    state_dict_converted[RESNET18_STATE_DICT_MAPPING[key]] = value
 669
 670            self.load_state_dict(state_dict_converted, strict=False)
 671
 672
 673class ResNet34(ResNetBase):
 674    r"""ResNet-34 backbone network.
 675
 676    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
 677    """
 678
 679    def __init__(
 680        self,
 681        input_channels: int,
 682        output_dim: int,
 683        activation_layer: nn.Module | None = nn.ReLU,
 684        batch_normalization: bool = True,
 685        bias: bool = False,
 686        **kwargs,
 687    ) -> None:
 688        r"""Construct and initialize the ResNet-34 backbone network.
 689
 690        **Args:**
 691        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 692        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 693        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 694        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 695        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 696        - **kwargs**: Reserved for multiple inheritance.
 697        """
 698        super().__init__(
 699            input_channels=input_channels,
 700            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
 701            building_block_nums=(3, 4, 6, 3),
 702            building_block_preceding_output_channels=(64, 64, 128, 256),
 703            building_block_input_channels=(64, 128, 256, 512),
 704            output_dim=output_dim,
 705            activation_layer=activation_layer,
 706            batch_normalization=batch_normalization,
 707            bias=bias,
 708            **kwargs,
 709        )
 710
 711
 712class ResNet50(ResNetBase):
 713    r"""ResNet-50 backbone network.
 714
 715    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
 716    """
 717
 718    def __init__(
 719        self,
 720        input_channels: int,
 721        output_dim: int,
 722        activation_layer: nn.Module | None = nn.ReLU,
 723        batch_normalization: bool = True,
 724        bias: bool = False,
 725        **kwargs,
 726    ) -> None:
 727        r"""Construct and initialize the ResNet-50 backbone network.
 728
 729        **Args:**
 730        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 731        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 732        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 733        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 734        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 735        - **kwargs**: Reserved for multiple inheritance.
 736        """
 737        super().__init__(
 738            input_channels=input_channels,
 739            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
 740            building_block_nums=(3, 4, 6, 3),
 741            building_block_preceding_output_channels=(64, 256, 512, 1024),
 742            building_block_input_channels=(64, 128, 256, 512),
 743            output_dim=output_dim,
 744            activation_layer=activation_layer,
 745            batch_normalization=batch_normalization,
 746            bias=bias,
 747            **kwargs,
 748        )
 749
 750
 751class ResNet101(ResNetBase):
 752    r"""ResNet-101 backbone network.
 753
 754    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
 755    """
 756
 757    def __init__(
 758        self,
 759        input_channels: int,
 760        output_dim: int,
 761        activation_layer: nn.Module | None = nn.ReLU,
 762        batch_normalization: bool = True,
 763        bias: bool = False,
 764        **kwargs,
 765    ) -> None:
 766        r"""Construct and initialize the ResNet-101 backbone network.
 767
 768        **Args:**
 769        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 770        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 771        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 772        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 773        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 774        - **kwargs**: Reserved for multiple inheritance.
 775        """
 776        super().__init__(
 777            input_channels=input_channels,
 778            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
 779            building_block_nums=(3, 4, 23, 3),
 780            building_block_preceding_output_channels=(64, 256, 512, 1024),
 781            building_block_input_channels=(64, 128, 256, 512),
 782            output_dim=output_dim,
 783            activation_layer=activation_layer,
 784            batch_normalization=batch_normalization,
 785            bias=bias,
 786            **kwargs,
 787        )
 788
 789
 790class ResNet152(ResNetBase):
 791    r"""ResNet-152 backbone network.
 792
 793    This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
 794    """
 795
 796    def __init__(
 797        self,
 798        input_channels: int,
 799        output_dim: int,
 800        activation_layer: nn.Module | None = nn.ReLU,
 801        batch_normalization: bool = True,
 802        bias: bool = False,
 803        **kwargs,
 804    ) -> None:
 805        r"""Construct and initialize the ResNet-152 backbone network.
 806
 807        **Args:**
 808        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 809        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 810        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 811        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 812        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 813        - **kwargs**: Reserved for multiple inheritance.
 814        """
 815        super().__init__(
 816            input_channels=input_channels,
 817            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
 818            building_block_nums=(3, 8, 36, 3),
 819            building_block_preceding_output_channels=(64, 256, 512, 1024),
 820            building_block_input_channels=(64, 128, 256, 512),
 821            output_dim=output_dim,
 822            activation_layer=activation_layer,
 823            batch_normalization=batch_normalization,
 824            bias=bias,
 825            **kwargs,
 826        )
 827
 828
 829class CLResNet18(CLBackbone, ResNet18):
 830    r"""The ResNet-18 backbone network for continual learning.
 831
 832    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
 833    """
 834
 835    def __init__(
 836        self,
 837        input_channels: int,
 838        output_dim: int,
 839        activation_layer: nn.Module | None = nn.ReLU,
 840        batch_normalization: bool = True,
 841        bias: bool = False,
 842        pretrained_weights: str | None = None,
 843        **kwargs,
 844    ) -> None:
 845        r"""Construct and initialize the ResNet-18 backbone network for continual learning.
 846
 847        **Args:**
 848        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 849        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 850        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 851        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 852        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 853        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
 854        - **kwargs**: Reserved for multiple inheritance.
 855        """
 856        super().__init__(
 857            input_channels=input_channels,
 858            output_dim=output_dim,
 859            activation_layer=activation_layer,
 860            batch_normalization=batch_normalization,
 861            bias=bias,
 862            pretrained_weights=pretrained_weights,
 863            **kwargs,
 864        )
 865
 866    def forward(
 867        self, input: Tensor, stage: str = None, task_id: int | None = None
 868    ) -> tuple[Tensor, dict[str, Tensor]]:
 869        r"""The forward pass for data. It is the same for all tasks.
 870
 871        **Args:**
 872        - **input** (`Tensor`): the input tensor from data.
 873        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
 874        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
 875
 876        **Returns:**
 877        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
 878        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 879        """
 880        return ResNet18.forward(self, input, stage)  # call the ResNet18 forward method
 881
 882
 883class CLResNet34(CLBackbone, ResNet34):
 884    r"""The ResNet-34 backbone network for continual learning.
 885
 886    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
 887    """
 888
 889    def __init__(
 890        self,
 891        input_channels: int,
 892        output_dim: int,
 893        activation_layer: nn.Module | None = nn.ReLU,
 894        batch_normalization: bool = True,
 895        bias: bool = False,
 896        pretrained_weights: str | None = None,
 897        **kwargs,
 898    ) -> None:
 899        r"""Construct and initialize the ResNet-34 backbone network for continual learning.
 900
 901        **Args:**
 902        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 903        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 904        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 905        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 906        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 907        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
 908        - **kwargs**: Reserved for multiple inheritance.
 909        """
 910        super().__init__(
 911            input_channels=input_channels,
 912            output_dim=output_dim,
 913            activation_layer=activation_layer,
 914            batch_normalization=batch_normalization,
 915            bias=bias,
 916            pretrained_weights=pretrained_weights,
 917            **kwargs,
 918        )
 919
 920    def forward(
 921        self, input: Tensor, stage: str = None, task_id: int | None = None
 922    ) -> tuple[Tensor, dict[str, Tensor]]:
 923        r"""The forward pass for data. It is the same for all tasks.
 924
 925        **Args:**
 926        - **input** (`Tensor`): the input tensor from data.
 927        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
 928        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
 929
 930        **Returns:**
 931        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
 932        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 933        """
 934        return ResNet34.forward(self, input, stage)  # call the ResNet34 forward method
 935
 936
 937class CLResNet50(CLBackbone, ResNet50):
 938    r"""The ResNet-50 backbone network for continual learning.
 939
 940    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
 941    """
 942
 943    def __init__(
 944        self,
 945        input_channels: int,
 946        output_dim: int,
 947        activation_layer: nn.Module | None = nn.ReLU,
 948        batch_normalization: bool = True,
 949        bias: bool = False,
 950        pretrained_weights: str | None = None,
 951        **kwargs,
 952    ) -> None:
 953        r"""Construct and initialize the ResNet-50 backbone network for continual learning.
 954
 955        **Args:**
 956        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 957        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 958        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 959        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 960        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 961        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
 962        - **kwargs**: Reserved for multiple inheritance.
 963        """
 964        super().__init__(
 965            input_channels=input_channels,
 966            output_dim=output_dim,
 967            activation_layer=activation_layer,
 968            batch_normalization=batch_normalization,
 969            bias=bias,
 970            pretrained_weights=pretrained_weights,
 971            **kwargs,
 972        )
 973
 974    def forward(
 975        self, input: Tensor, stage: str = None, task_id: int | None = None
 976    ) -> tuple[Tensor, dict[str, Tensor]]:
 977        r"""The forward pass for data. It is the same for all tasks.
 978
 979        **Args:**
 980        - **input** (`Tensor`): the input tensor from data.
 981        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
 982        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
 983
 984        **Returns:**
 985        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
 986        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 987        """
 988        return ResNet50.forward(self, input, stage)  # call the ResNet50 forward method
 989
 990
 991class CLResNet101(CLBackbone, ResNet101):
 992    r"""The ResNet-101 backbone network for continual learning.
 993
 994    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
 995    """
 996
 997    def __init__(
 998        self,
 999        input_channels: int,
1000        output_dim: int,
1001        activation_layer: nn.Module | None = nn.ReLU,
1002        batch_normalization: bool = True,
1003        bias: bool = False,
1004        pretrained_weights: str | None = None,
1005        **kwargs,
1006    ) -> None:
1007        r"""Construct and initialize the ResNet-101 backbone network for continual learning.
1008
1009        **Args:**
1010        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
1011        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1012        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1013        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
1014        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
1015        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
1016        - **kwargs**: Reserved for multiple inheritance.
1017        """
1018        super().__init__(
1019            input_channels=input_channels,
1020            output_dim=output_dim,
1021            activation_layer=activation_layer,
1022            batch_normalization=batch_normalization,
1023            bias=bias,
1024            pretrained_weights=pretrained_weights,
1025            **kwargs,
1026        )
1027
1028    def forward(
1029        self, input: Tensor, stage: str = None, task_id: int | None = None
1030    ) -> tuple[Tensor, dict[str, Tensor]]:
1031        r"""The forward pass for data. It is the same for all tasks.
1032
1033        **Args:**
1034        - **input** (`Tensor`): the input tensor from data.
1035        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
1036        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
1037
1038        **Returns:**
1039        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
1040        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
1041        """
1042        return ResNet101.forward(
1043            self, input, stage
1044        )  # call the ResNet101 forward method
1045
1046
class CLResNet152(CLBackbone, ResNet152):
    r"""The ResNet-152 backbone network for continual learning.

    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        pretrained_weights: str | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-152 backbone network for continual learning.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization already serves a similar purpose to the bias.
        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
        - **kwargs**: Reserved for multiple inheritance.
        """
        # cooperative multiple inheritance: forwards everything to CLBackbone / ResNet152
        super().__init__(
            input_channels=input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            pretrained_weights=pretrained_weights,
            **kwargs,
        )

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
        """
        return ResNet152.forward(
            self, input, stage
        )  # call the ResNet152 forward method
class ResNetBlockSmall(Backbone):
    r"""The smaller building block for ResNet-18/34.

    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        output_dim: int | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the smaller building block.

        **Args:**
        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of input of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is applied at the 2nd (last) convolutional layer while the 1st convolutional layer remains at stride 1.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization already serves a similar purpose to the bias.
        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(output_dim=None, **kwargs)

        self.batch_normalization: bool = batch_normalization
        r"""Whether to use batch normalization after convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Whether to use activation function after convolutional layers."""

        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
        r"""Format and store full name of the 1st weighted convolutional layer. """
        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
        r"""Format and store full name of the 2nd weighted convolutional layer. """

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
        layer_output_channels = (
            input_channels  # the output channels of the 1st convolutional layer
        )
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=bias,
        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
        r"""The 1st weight convolutional layer of the smaller building block. """
        self.weighted_layer_names.append(
            self.full_1st_layer_name
        )  # update the weighted layer names
        if self.batch_normalization:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer. """

        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
        layer_output_channels = (
            input_channels * 1
        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
        self.conv2 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=3,
            stride=overall_stride,
            padding=1,
            bias=bias,
        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
        r"""The 2nd weight convolutional layer of the smaller building block. """
        self.weighted_layer_names.append(
            self.full_2nd_layer_name
        )  # update the weighted layer names
        if self.batch_normalization:  # use the stored flag, consistent with the 1st layer above
            self.conv_bn2 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
        if self.activation:
            self.conv_activation2 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 2nd weighted convolutional layer. """

        self.identity_downsample: nn.Module | None = (
            nn.Conv2d(
                in_channels=preceding_output_channels,
                out_channels=input_channels,
                kernel_size=1,
                stride=overall_stride,
                bias=False,
            )
            if preceding_output_channels != input_channels or overall_stride != 1
            else None
        )  # construct the identity downsample function; `None` when no downsampling is needed
        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """

    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data.

        **Args:**
        - **input** (`Tensor`): the input feature maps.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
        """
        activations = {}

        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's

        x = input
        x = self.conv1(x)
        if self.batch_normalization:
            x = self.conv_bn1(
                x
            )  # batch normalization can be before or after activation. We put it before activation here
        if self.activation:
            x = self.conv_activation1(x)
        activations[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)
        if self.batch_normalization:
            x = self.conv_bn2(
                x
            )  # batch normalization can be before or after activation. We put it before activation here

        x = x + identity
        if self.activation:
            x = self.conv_activation2(x)  # activation after the shortcut connection
        activations[self.full_2nd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, activations

The smaller building block for ResNet-18/34.

It consists of 2 weighted convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the original ResNet paper.

ResNetBlockSmall( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, output_dim: int | None = None, **kwargs)
 40    def __init__(
 41        self,
 42        outer_layer_name: str,
 43        block_idx: int,
 44        preceding_output_channels: int,
 45        input_channels: int,
 46        overall_stride: int,
 47        activation_layer: nn.Module | None = nn.ReLU,
 48        batch_normalization: bool = True,
 49        bias: bool = False,
 50        output_dim: int | None = None,
 51        **kwargs,
 52    ) -> None:
 53        r"""Construct and initialize the smaller building block.
 54
 55        **Args:**
 56        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 57        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 58        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 59        - **input_channels** (`int`): the number of channels of input of this building block.
 60        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
 61        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 62        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 63        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
 64        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
 65        - **kwargs**: Reserved for multiple inheritance.
 66        """
 67        super().__init__(output_dim=None, **kwargs)
 68
 69        self.batch_normalization: bool = batch_normalization
 70        r"""Whether to use batch normalization after convolutional layers."""
 71        self.activation: bool = activation_layer is not None
 72        r"""Whether to use activation function after convolutional layers."""
 73
 74        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
 75        r"""Format and store full name of the 1st weighted convolutional layer. """
 76        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
 77        r"""Format and store full name of the 2nd weighted convolutional layer. """
 78
 79        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 80        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
 81        layer_output_channels = (
 82            input_channels  # the output channels of the 1st convolutional layer
 83        )
 84        self.conv1 = nn.Conv2d(
 85            in_channels=layer_input_channels,
 86            out_channels=layer_output_channels,
 87            kernel_size=3,
 88            stride=1,
 89            padding=1,
 90            bias=bias,
 91        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
 92        r"""The 1st weight convolutional layer of the smaller building block. """
 93        self.weighted_layer_names.append(
 94            self.full_1st_layer_name
 95        )  # update the weighted layer names
 96        if self.batch_normalization:
 97            self.conv_bn1 = nn.BatchNorm2d(
 98                num_features=layer_output_channels
 99            )  # construct the batch normalization layer
100            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
101        if self.activation:
102            self.conv_activation1 = activation_layer()  # construct the activation layer
103            r"""The activation layer after the 1st weighted convolutional layer. """
104
105        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
106        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
107        layer_output_channels = (
108            input_channels * 1
109        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
110        self.conv2 = nn.Conv2d(
111            in_channels=layer_input_channels,
112            out_channels=layer_output_channels,
113            kernel_size=3,
114            stride=overall_stride,
115            padding=1,
116            bias=bias,
117        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
118        r"""The 2nd weight convolutional layer of the smaller building block. """
119        self.weighted_layer_names.append(
120            self.full_2nd_layer_name
121        )  # update the weighted layer names
122        if batch_normalization:
123            self.conv_bn2 = nn.BatchNorm2d(
124                num_features=layer_output_channels
125            )  # construct the batch normalization layer
126            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
127        if self.activation:
128            self.conv_activation2 = activation_layer()  # construct the activation layer
129            r"""The activation layer after the 2nd weighted convolutional layer. """
130
131        self.identity_downsample: nn.Module = (
132            nn.Conv2d(
133                in_channels=preceding_output_channels,
134                out_channels=input_channels,
135                kernel_size=1,
136                stride=overall_stride,
137                bias=False,
138            )
139            if preceding_output_channels != input_channels or overall_stride != 1
140            else None
141        )  # construct the identity downsample function
142        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """

Construct and initialize the smaller building block.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. This stride is applied at the 2nd (last) convolutional layer, while the 1st convolutional layer remains at stride 1.
  • activation_layer (nn.Module): activation function of each layer (if not None); if None, no activation layer is used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalization already serves a similar purpose to the bias.
  • output_dim (int | None): placeholder to be compatible with Backbone API. Not used in building blocks.
  • kwargs: Reserved for multiple inheritance.
batch_normalization: bool

Whether to use batch normalization after convolutional layers.

activation: bool

Whether to use activation function after convolutional layers.

full_1st_layer_name

Format and store full name of the 1st weighted convolutional layer.

full_2nd_layer_name

Format and store full name of the 2nd weighted convolutional layer.

conv1

The 1st weight convolutional layer of the smaller building block.

conv2

The 2nd weight convolutional layer of the smaller building block.

identity_downsample: torch.nn.modules.module.Module

The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists.

def forward( self, input: torch.Tensor) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
144    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
145        r"""The forward pass for data.
146
147        **Args:**
148        - **input** (`Tensor`): the input feature maps.
149
150        **Returns:**
151        - **output_feature** (`Tensor`): the output feature maps.
152        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
153        """
154        activations = {}
155
156        identity = (
157            self.identity_downsample(input)
158            if self.identity_downsample is not None
159            else input
160        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
161
162        x = input
163        x = self.conv1(x)
164        if self.batch_normalization:
165            x = self.conv_bn1(
166                x
167            )  # batch normalization can be before or after activation. We put it before activation here
168        if self.activation:
169            x = self.conv_activation1(x)
170        activations[self.full_1st_layer_name] = x  # store the hidden feature
171
172        x = self.conv2(x)
173        if self.batch_normalization:
174            x = self.conv_bn2(
175                x
176            )  # batch normalization can be before or after activation. We put it before activation here
177
178        x = x + identity
179        if self.activation:
180            x = self.conv_activation2(x)  # activation after the shortcut connection
181        activations[self.full_2nd_layer_name] = x  # store the hidden feature
182
183        output_feature = x
184
185        return output_feature, activations

The forward pass for data.

Args:

  • input (Tensor): the input feature maps.

Returns:

  • output_feature (Tensor): the output feature maps.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNetBlockLarge(clarena.backbones.base.Backbone):
class ResNetBlockLarge(Backbone):
    r"""The larger building block for ResNet-50/101/152, referred to as the "bottleneck" building block in the paper.

    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        output_dim: int | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the larger building block.

        **Args:**
        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of the input of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at the 2nd (middle) convolutional layer, while the 1st and 3rd convolutional layers keep a stride of 1.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layers are used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalization performs a similar role to a bias term.
        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(output_dim=None, **kwargs)

        self.batch_normalization: bool = batch_normalization
        r"""Whether to use batch normalization after convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Whether to use activation function after convolutional layers."""

        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
        r"""Format and store full name of the 1st weighted convolutional layer. """
        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
        r"""Format and store full name of the 2nd weighted convolutional layer. """
        self.full_3rd_layer_name = f"{outer_layer_name}/{block_idx}/conv3"
        r"""Format and store full name of the 3rd weighted convolutional layer. """

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
        layer_output_channels = (
            input_channels  # the output channels of the 1st convolutional layer
        )
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )  # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here
        r"""The 1st weight convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_1st_layer_name
        )  # update the weighted layer names
        if self.batch_normalization:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer. """

        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
        layer_output_channels = (
            input_channels
            * 1  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
        )
        self.conv2 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=3,
            stride=overall_stride,
            padding=1,
            bias=bias,
        )  # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here
        r"""The 2nd weight convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_2nd_layer_name
        )  # update the weighted layer names
        if self.batch_normalization:
            self.conv_bn2 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
        if self.activation:
            self.conv_activation2 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 2nd weighted convolutional layer. """

        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = (
            input_channels * 1
        )  # the input channels of the 3rd (final) convolutional layer, same as output of 2nd layer
        layer_output_channels = (
            input_channels * 4  # the output channels of the 3rd layer (4x expansion)
        )
        self.conv3 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )  # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here
        r"""The 3rd weight convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_3rd_layer_name
        )  # update the weighted layer names
        if self.batch_normalization:  # FIX: was `if batch_normalization:`; use the stored attribute, consistent with conv_bn1/conv_bn2
            self.conv_bn3 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
        if self.activation:
            self.conv_activation3 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 3rd weighted convolutional layer. """

        self.identity_downsample: nn.Module | None = (  # FIX annotation: may be `None` when no downsampling is needed
            nn.Conv2d(
                in_channels=preceding_output_channels,
                out_channels=input_channels * 4,
                kernel_size=1,
                stride=overall_stride,
                bias=False,
            )
            if preceding_output_channels != input_channels * 4 or overall_stride != 1
            else None
        )
        r"""The convolutional layer for downsampling the identity in the shortcut connection if the dimension of the identity from the input doesn't match the output's. This case only happens when the number of preceding output channels doesn't equal the number of expanded output channels or a layer with stride > 1 exists. """
        # NOTE(review): unlike torchvision's reference ResNet, the downsample path here has no
        # batch normalization after the 1x1 conv — presumably intentional in this codebase; confirm.

    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data.

        **Args:**
        - **input** (`Tensor`): the input feature maps.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
        """
        activations = {}

        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's

        x = input
        x = self.conv1(x)
        if self.batch_normalization:
            x = self.conv_bn1(
                x
            )  # batch normalization can be before or after activation. We put it before activation here
        if self.activation:
            x = self.conv_activation1(x)
        activations[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)
        if self.batch_normalization:
            x = self.conv_bn2(
                x
            )  # batch normalization can be before or after activation. We put it before activation here
        if self.activation:
            x = self.conv_activation2(x)
        activations[self.full_2nd_layer_name] = x  # store the hidden feature

        x = self.conv3(x)
        if self.batch_normalization:
            x = self.conv_bn3(
                x
            )  # batch normalization can be before or after activation. We put it before activation here

        x = x + identity
        if self.activation:
            x = self.conv_activation3(x)  # activation after the shortcut connection
        activations[self.full_3rd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, activations

The larger building block for ResNet-50/101/152. It is referred to as the "bottleneck" building block in the paper.

It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the original ResNet paper.

ResNetBlockLarge( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, output_dim: int | None = None, **kwargs)
194    def __init__(
195        self,
196        outer_layer_name: str,
197        block_idx: int,
198        preceding_output_channels: int,
199        input_channels: int,
200        overall_stride: int,
201        activation_layer: nn.Module | None = nn.ReLU,
202        batch_normalization: bool = True,
203        bias: bool = False,
204        output_dim: int | None = None,
205        **kwargs,
206    ) -> None:
207        r"""Construct and initialize the larger building block.
208
209        **Args:**
210        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
211        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
212        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
213        - **input_channels** (`int`): the number of channels of input of this building block.
214        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
215        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
216        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
217        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
218        - **output_dim** (`int` | `None`): placeholder to be compatible with Backbone API. Not used in building blocks.
219        - **kwargs**: Reserved for multiple inheritance.
220        """
221        super().__init__(output_dim=None, **kwargs)
222
223        self.batch_normalization: bool = batch_normalization
224        r"""Whether to use batch normalization after convolutional layers."""
225        self.activation: bool = activation_layer is not None
226        r"""Whether to use activation function after convolutional layers."""
227
228        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
229        r"""Format and store full name of the 1st weighted convolutional layer. """
230        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
231        r"""Format and store full name of the 2nd weighted convolutional layer. """
232        self.full_3rd_layer_name = f"{outer_layer_name}/{block_idx}/conv3"
233        r"""Format and store full name of the 3rd weighted convolutional layer. """
234
235        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
236        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
237        layer_output_channels = (
238            input_channels  # the output channels of the 1st convolutional layer
239        )
240        self.conv1 = nn.Conv2d(
241            in_channels=layer_input_channels,
242            out_channels=layer_output_channels,
243            kernel_size=1,
244            stride=1,
245            padding=0,
246            bias=bias,
247        )  # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here
248        r"""The 1st weight convolutional layer of the larger building block. """
249        self.weighted_layer_names.append(
250            self.full_1st_layer_name
251        )  # update the weighted layer names
252        if self.batch_normalization:
253            self.conv_bn1 = nn.BatchNorm2d(
254                num_features=layer_output_channels
255            )  # construct the batch normalization layer
256            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
257        if self.activation:
258            self.conv_activation1 = activation_layer()  # construct the activation layer
259            r"""The activation layer after the 1st weighted convolutional layer. """
260
261        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
262        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
263        layer_output_channels = (
264            input_channels
265            * 1  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
266        )
267        self.conv2 = nn.Conv2d(
268            in_channels=layer_input_channels,
269            out_channels=layer_output_channels,
270            kernel_size=3,
271            stride=overall_stride,
272            padding=1,
273            bias=bias,
274        )  # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here
275        r"""The 2nd weight convolutional layer of the larger building block. """
276        self.weighted_layer_names.append(
277            self.full_2nd_layer_name
278        )  # update the weighted layer names
279        if self.batch_normalization:
280            self.conv_bn2 = nn.BatchNorm2d(
281                num_features=layer_output_channels
282            )  # construct the batch normalization layer
283            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
284        if self.activation:
285            self.conv_activation2 = activation_layer()  # construct the activation layer
286            r"""The activation layer after the 2nd weighted convolutional layer. """
287
288        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
289        layer_input_channels = (
290            input_channels * 1
291        )  # the input channels of the 3rd (final) convolutional layer, same as output of 2nd layer
292        layer_output_channels = (
293            input_channels * 4  # the output channels of the 3rd layer (4x expansion)
294        )
295        self.conv3 = nn.Conv2d(
296            in_channels=layer_input_channels,
297            out_channels=layer_output_channels,
298            kernel_size=1,
299            stride=1,
300            padding=0,
301            bias=bias,
302        )  # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here
303        r"""The 3rd weight convolutional layer of the larger building block. """
304        self.weighted_layer_names.append(
305            self.full_3rd_layer_name
306        )  # update the weighted layer names
307        if batch_normalization:
308            self.conv_bn3 = nn.BatchNorm2d(
309                num_features=layer_output_channels
310            )  # construct the batch normalization layer
311            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
312        if self.activation:
313            self.conv_activation3 = activation_layer()  # construct the activation layer
314            r"""The activation layer after the 3rd weighted convolutional layer. """
315
316        self.identity_downsample: nn.Module = (
317            nn.Conv2d(
318                in_channels=preceding_output_channels,
319                out_channels=input_channels * 4,
320                kernel_size=1,
321                stride=overall_stride,
322                bias=False,
323            )
324            if preceding_output_channels != input_channels * 4 or overall_stride != 1
325            else None
326        )
327        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """

Construct and initialize the larger building block.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization performs a similar role to a bias term.
  • output_dim (int | None): placeholder to be compatible with Backbone API. Not used in building blocks.
  • kwargs: Reserved for multiple inheritance.
batch_normalization: bool

Whether to use batch normalization after convolutional layers.

activation: bool

Whether to use activation function after convolutional layers.

full_1st_layer_name

Format and store full name of the 1st weighted convolutional layer.

full_2nd_layer_name

Format and store full name of the 2nd weighted convolutional layer.

full_3rd_layer_name

Format and store full name of the 3rd weighted convolutional layer.

conv1

The 1st weight convolutional layer of the larger building block.

conv2

The 2nd weight convolutional layer of the larger building block.

conv3

The 3rd weight convolutional layer of the larger building block.

identity_downsample: torch.nn.modules.module.Module

The convolutional layer for downsampling the identity in the shortcut connection if the dimension of the identity from the input doesn't match the output's. This case only happens when the number of input channels doesn't equal the number of preceding output channels or a layer with stride > 1 exists.

def forward( self, input: torch.Tensor) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
329    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
330        r"""The forward pass for data.
331
332        **Args:**
333        - **input** (`Tensor`): the input feature maps.
334
335        **Returns:**
336        - **output_feature** (`Tensor`): the output feature maps.
337        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
338        """
339        activations = {}
340
341        identity = (
342            self.identity_downsample(input)
343            if self.identity_downsample is not None
344            else input
345        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
346
347        x = input
348        x = self.conv1(x)
349        if self.batch_normalization:
350            x = self.conv_bn1(
351                x
352            )  # batch normalization can be before or after activation. We put it before activation here
353        if self.activation:
354            x = self.conv_activation1(x)
355        activations[self.full_1st_layer_name] = x  # store the hidden feature
356
357        x = self.conv2(x)
358        if self.batch_normalization:
359            x = self.conv_bn2(
360                x
361            )  # batch normalization can be before or after activation. We put it before activation here
362        if self.activation:
363            x = self.conv_activation2(x)
364        activations[self.full_2nd_layer_name] = x  # store the hidden feature
365
366        x = self.conv3(x)
367        if self.batch_normalization:
368            x = self.conv_bn3(
369                x
370            )  # batch normalization can be before or after activation. We put it before activation here
371
372        x = x + identity
373        if self.activation:
374            x = self.conv_activation3(x)  # activation after the shortcut connection
375        activations[self.full_3rd_layer_name] = x  # store the hidden feature
376
377        output_feature = x
378
379        return output_feature, activations

The forward pass for data.

Args:

  • input (Tensor): the input feature maps.

Returns:

  • output_feature (Tensor): the output feature maps.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNetBase(clarena.backbones.base.Backbone):
class ResNetBase(Backbone):
    r"""The base class of [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    ResNet is a convolutional network architecture, which has a 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers, each of which contains multiple convolutional parameter layers. Each of the 4 layers is constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are an average pooling layer and a fully connected layer which connects to the CL output heads.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(output_dim=output_dim, **kwargs)

        self.batch_normalization: bool = batch_normalization
        r"""Whether to use batch normalization after convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Whether to use activation function after convolutional layers."""

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receive the input of the entire network
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )  # construct the 1st weight convolutional layer of the entire ResNet
        r"""The 1st weight convolutional layer of the entire ResNet. It is always with fixed kernel size 7x7, stride 2, and padding 3. """
        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
        if self.batch_normalization:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer. """

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        r"""The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2. """

        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2-5 convolutional layers of the ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer to make the dimensions match.

        **Args:**
        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block where other building blocks remain their own overall stride of 1. Inside that building block, this stride is performed at certain convolutional layer in the building block where other convolutional layers remain stride of 1:
            - For `ResNetBlockSmall`, it performs at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it performs at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """

        layer = []

        for block_idx in range(building_block_num):
            layer.append(
                building_block_type(
                    outer_layer_name=layer_name,
                    block_idx=block_idx,
                    preceding_output_channels=(
                        preceding_output_channels
                        if block_idx == 0
                        else (
                            input_channels
                            if building_block_type == ResNetBlockSmall
                            else input_channels * 4
                        )
                    ),  # if it's the 1st block in this multi-building-block layer, it should be the number of channels of the preceding output of this entire multi-building-block layer. Otherwise, it should be the number of channels from last building block where the number of channels is 4 times of the input channels for `ResNetBlockLarge` than `ResNetBlockSmall`.
                    input_channels=input_channels,
                    overall_stride=(
                        overall_stride if block_idx == 0 else 1
                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
                    activation_layer=activation_layer,
                    batch_normalization=batch_normalization,
                    bias=bias,
                )
            )

            self.weighted_layer_names += layer[
                -1
            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync to the weighted layer names list in the outer network

        return nn.Sequential(*layer)

    def forward(
        self, input: Tensor, stage: str | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): the stage of the forward pass. Unused here; kept for interface compatibility with other backbones. Default `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
        """
        batch_size = input.size(0)
        activations = {}

        x = input

        x = self.conv1(x)
        if self.batch_normalization:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        activations["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, activations_block = block(x)
            activations.update(activations_block)  # store the hidden feature
        for block in self.conv3x:
            x, activations_block = block(x)
            activations.update(activations_block)  # store the hidden feature
        for block in self.conv4x:
            x, activations_block = block(x)
            activations.update(activations_block)  # store the hidden feature
        for block in self.conv5x:
            x, activations_block = block(x)
            activations.update(activations_block)  # store the hidden feature

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through heads

        return output_feature, activations

The base class of residual network (ResNet).

ResNet is a convolutional network architecture, which has a 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers, each of which contains multiple convolutional parameter layers. Each of the 4 layers is constructed from basic building blocks which are either small (ResNetBlockSmall) or large (ResNetBlockLarge). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are an average pooling layer and a fully connected layer which connects to the CL output heads.

ResNetBase( input_channels: int, building_block_type: ResNetBlockSmall | ResNetBlockLarge, building_block_nums: tuple[int, int, int, int], building_block_preceding_output_channels: tuple[int, int, int, int], building_block_input_channels: tuple[int, int, int, int], output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, **kwargs)
    def __init__(
        self,
        input_channels: int,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(output_dim=output_dim, **kwargs)

        self.batch_normalization: bool = batch_normalization
        r"""Whether to use batch normalization after convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Whether to use activation function after convolutional layers."""

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receive the input of the entire network
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )  # construct the 1st weight convolutional layer of the entire ResNet
        r"""The 1st weight convolutional layer of the entire ResNet. It is always with fixed kernel size 7x7, stride 2, and padding 3. """
        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
        if self.batch_normalization:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalization layer
            r"""The batch normalization (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer. """

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        r"""The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2. """

        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """

Construct and initialize the ResNet backbone network.

Args:

  • input_channels (int): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
  • building_block_type (ResNetBlockSmall | ResNetBlockLarge): the type of building block used in the ResNet.
  • building_block_nums (tuple[int, int, int, int]): the number of building blocks in the 2-5 convolutional layer correspondingly.
  • building_block_preceding_output_channels (tuple[int, int, int, int]): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
  • building_block_input_channels (tuple[int, int, int, int]): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalization are doing the similar thing with bias.
  • kwargs: Reserved for multiple inheritance.
batch_normalization: bool

Whether to use batch normalization after convolutional layers.

activation: bool

Whether to use activation function after convolutional layers.

conv1

The 1st weight convolutional layer of the entire ResNet. It always uses a fixed kernel size of 7x7, stride 2, and padding 3.

maxpool

The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2.

conv2x

The 2nd convolutional layer of the ResNet, which contains multiple blocks.

conv3x

The 3rd convolutional layer of the ResNet, which contains multiple blocks.

conv4x

The 4th convolutional layer of the ResNet, which contains multiple blocks.

conv5x

The 5th convolutional layer of the ResNet, which contains multiple blocks.

avepool

The average pooling layer which is laid after the convolutional layers and before feature maps are flattened.

def forward( self, input: torch.Tensor, stage: str = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
573    def forward(
574        self, input: Tensor, stage: str = None
575    ) -> tuple[Tensor, dict[str, Tensor]]:
576        r"""The forward pass for data. It is the same for all tasks.
577
578        **Args:**
579        - **input** (`Tensor`): the input tensor from data.
580
581        **Returns:**
582        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
583        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
584        """
585        batch_size = input.size(0)
586        activations = {}
587
588        x = input
589
590        x = self.conv1(x)
591        if self.batch_normalization:
592            x = self.conv_bn1(x)
593        if self.activation:
594            x = self.conv_activation1(x)
595        activations["conv1"] = x
596
597        x = self.maxpool(x)
598
599        for block in self.conv2x:
600            x, activations_block = block(x)
601            activations.update(activations_block)  # store the hidden feature
602        for block in self.conv3x:
603            x, activations_block = block(x)
604            activations.update(activations_block)  # store the hidden feature
605        for block in self.conv4x:
606            x, activations_block = block(x)
607            activations.update(activations_block)  # store the hidden feature
608        for block in self.conv5x:
609            x, activations_block = block(x)
610            activations.update(activations_block)  # store the hidden feature
611
612        x = self.avepool(x)
613
614        output_feature = x.view(batch_size, -1)  # flatten before going through heads
615
616        return output_feature, activations

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNet18(ResNetBase):
class ResNet18(ResNetBase):
    r"""ResNet-18 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        pretrained_weights: str | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-18 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
            building_block_nums=(2, 2, 2, 2),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            **kwargs,
        )

        if pretrained_weights is not None:
            # load the pretrained weights from TorchVision
            torchvision_resnet18_state_dict = torchvision.models.resnet18(
                weights=pretrained_weights
            ).state_dict()

            # mapping from torchvision resnet18 state dict to our ResNet18 state dict.
            # Use .get() so torchvision keys absent from the mapping are skipped
            # (same as keys deliberately mapped to None) instead of raising KeyError
            # when a new torchvision version adds state dict entries.
            state_dict_converted = {}
            for key, value in torchvision_resnet18_state_dict.items():
                converted_key = RESNET18_STATE_DICT_MAPPING.get(key)
                if converted_key is not None:
                    state_dict_converted[converted_key] = value

            self.load_state_dict(state_dict_converted, strict=False)

ResNet-18 backbone network.

This is a smaller architecture proposed in the original ResNet paper. It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet18( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
625    def __init__(
626        self,
627        input_channels: int,
628        output_dim: int,
629        activation_layer: nn.Module | None = nn.ReLU,
630        batch_normalization: bool = True,
631        bias: bool = False,
632        pretrained_weights: str | None = None,
633        **kwargs,
634    ) -> None:
635        r"""Construct and initialize the ResNet-18 backbone network.
636
637        **Args:**
638        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
639        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
640        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
641        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
642        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
643        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
644        - **kwargs**: Reserved for multiple inheritance.
645        """
646        super().__init__(
647            input_channels=input_channels,
648            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
649            building_block_nums=(2, 2, 2, 2),
650            building_block_preceding_output_channels=(64, 64, 128, 256),
651            building_block_input_channels=(64, 128, 256, 512),
652            output_dim=output_dim,
653            activation_layer=activation_layer,
654            batch_normalization=batch_normalization,
655            bias=bias,
656            **kwargs,
657        )
658
659        if pretrained_weights is not None:
660            # load the pretrained weights from TorchVision
661            torchvision_resnet18_state_dict = torchvision.models.resnet18(
662                weights=pretrained_weights
663            ).state_dict()
664
665            # mapping from torchvision resnet18 state dict to our ResNet18 state dict
666            state_dict_converted = {}
667            for key, value in torchvision_resnet18_state_dict.items():
668                if RESNET18_STATE_DICT_MAPPING[key] is not None:
669                    state_dict_converted[RESNET18_STATE_DICT_MAPPING[key]] = value
670
671            self.load_state_dict(state_dict_converted, strict=False)

Construct and initialize the ResNet-18 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already provides a learnable shift, making a separate bias redundant.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
class ResNet34(ResNetBase):
class ResNet34(ResNetBase):
    r"""ResNet-34 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-34 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        # ResNet-34 stage configuration (Table 1 in the paper): number of
        # small (2-conv) building blocks per stage, plus the channel
        # bookkeeping linking consecutive stages.
        stage_block_nums = (3, 4, 6, 3)
        stage_preceding_output_channels = (64, 64, 128, 256)
        stage_input_channels = (64, 128, 256, 512)

        super().__init__(
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # the smaller building block variant
            building_block_nums=stage_block_nums,
            building_block_preceding_output_channels=stage_preceding_output_channels,
            building_block_input_channels=stage_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            **kwargs,
        )

ResNet-34 backbone network.

This is a smaller architecture proposed in the original ResNet paper. It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet34( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, **kwargs)
680    def __init__(
681        self,
682        input_channels: int,
683        output_dim: int,
684        activation_layer: nn.Module | None = nn.ReLU,
685        batch_normalization: bool = True,
686        bias: bool = False,
687        **kwargs,
688    ) -> None:
689        r"""Construct and initialize the ResNet-34 backbone network.
690
691        **Args:**
692        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
693        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
694        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
695        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
696        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
697        - **kwargs**: Reserved for multiple inheritance.
698        """
699        super().__init__(
700            input_channels=input_channels,
701            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
702            building_block_nums=(3, 4, 6, 3),
703            building_block_preceding_output_channels=(64, 64, 128, 256),
704            building_block_input_channels=(64, 128, 256, 512),
705            output_dim=output_dim,
706            activation_layer=activation_layer,
707            batch_normalization=batch_normalization,
708            bias=bias,
709            **kwargs,
710        )

Construct and initialize the ResNet-34 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already provides a learnable shift, making a separate bias redundant.
  • kwargs: Reserved for multiple inheritance.
class ResNet50(ResNetBase):
class ResNet50(ResNetBase):
    r"""ResNet-50 backbone network.

    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-50 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        # ResNet-50 stage configuration (Table 1 in the paper): number of
        # large (bottleneck) building blocks per stage, plus the channel
        # bookkeeping linking consecutive stages.
        stage_block_nums = (3, 4, 6, 3)
        stage_preceding_output_channels = (64, 256, 512, 1024)
        stage_input_channels = (64, 128, 256, 512)

        super().__init__(
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # the larger building block variant
            building_block_nums=stage_block_nums,
            building_block_preceding_output_channels=stage_preceding_output_channels,
            building_block_input_channels=stage_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            **kwargs,
        )

ResNet-50 backbone network.

This is a larger architecture proposed in the original ResNet paper. It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet50( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, **kwargs)
719    def __init__(
720        self,
721        input_channels: int,
722        output_dim: int,
723        activation_layer: nn.Module | None = nn.ReLU,
724        batch_normalization: bool = True,
725        bias: bool = False,
726        **kwargs,
727    ) -> None:
728        r"""Construct and initialize the ResNet-50 backbone network.
729
730        **Args:**
731        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
732        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
733        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
734        - **batch_normalization** (`bool`): whether to use batch normalization after convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
735        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
736        - **kwargs**: Reserved for multiple inheritance.
737        """
738        super().__init__(
739            input_channels=input_channels,
740            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
741            building_block_nums=(3, 4, 6, 3),
742            building_block_preceding_output_channels=(64, 256, 512, 1024),
743            building_block_input_channels=(64, 128, 256, 512),
744            output_dim=output_dim,
745            activation_layer=activation_layer,
746            batch_normalization=batch_normalization,
747            bias=bias,
748            **kwargs,
749        )

Construct and initialize the ResNet-50 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already provides a learnable shift, making a separate bias redundant.
  • kwargs: Reserved for multiple inheritance.
class ResNet101(ResNetBase):
class ResNet101(ResNetBase):
    r"""ResNet-101 backbone network.

    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-101 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        # ResNet-101 stage configuration (Table 1 in the paper): number of
        # large (bottleneck) building blocks per stage, plus the channel
        # bookkeeping linking consecutive stages.
        stage_block_nums = (3, 4, 23, 3)
        stage_preceding_output_channels = (64, 256, 512, 1024)
        stage_input_channels = (64, 128, 256, 512)

        super().__init__(
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # the larger building block variant
            building_block_nums=stage_block_nums,
            building_block_preceding_output_channels=stage_preceding_output_channels,
            building_block_input_channels=stage_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            **kwargs,
        )

ResNet-101 backbone network.

This is a larger architecture proposed in the original ResNet paper. It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet101( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, **kwargs)
758    def __init__(
759        self,
760        input_channels: int,
761        output_dim: int,
762        activation_layer: nn.Module | None = nn.ReLU,
763        batch_normalization: bool = True,
764        bias: bool = False,
765        **kwargs,
766    ) -> None:
767        r"""Construct and initialize the ResNet-101 backbone network.
768
769        **Args:**
770        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
771        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
772        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
773        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
774        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
775        - **kwargs**: Reserved for multiple inheritance.
776        """
777        super().__init__(
778            input_channels=input_channels,
779            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
780            building_block_nums=(3, 4, 23, 3),
781            building_block_preceding_output_channels=(64, 256, 512, 1024),
782            building_block_input_channels=(64, 128, 256, 512),
783            output_dim=output_dim,
784            activation_layer=activation_layer,
785            batch_normalization=batch_normalization,
786            bias=bias,
787            **kwargs,
788        )

Construct and initialize the ResNet-101 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already provides a learnable shift, making a separate bias redundant.
  • kwargs: Reserved for multiple inheritance.
class ResNet152(ResNetBase):
class ResNet152(ResNetBase):
    r"""ResNet-152 backbone network.

    This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-152 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
        - **kwargs**: Reserved for multiple inheritance.
        """
        # ResNet-152 stage configuration (Table 1 in the paper): number of
        # large (bottleneck) building blocks per stage, plus the channel
        # bookkeeping linking consecutive stages.
        stage_block_nums = (3, 8, 36, 3)
        stage_preceding_output_channels = (64, 256, 512, 1024)
        stage_input_channels = (64, 128, 256, 512)

        super().__init__(
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # the larger building block variant
            building_block_nums=stage_block_nums,
            building_block_preceding_output_channels=stage_preceding_output_channels,
            building_block_input_channels=stage_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            **kwargs,
        )

ResNet-152 backbone network.

This is the largest architecture proposed in the original ResNet paper. It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet152( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, **kwargs)
797    def __init__(
798        self,
799        input_channels: int,
800        output_dim: int,
801        activation_layer: nn.Module | None = nn.ReLU,
802        batch_normalization: bool = True,
803        bias: bool = False,
804        **kwargs,
805    ) -> None:
806        r"""Construct and initialize the ResNet-152 backbone network.
807
808        **Args:**
809        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
810        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
811        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
812        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
813        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
814        - **kwargs**: Reserved for multiple inheritance.
815        """
816        super().__init__(
817            input_channels=input_channels,
818            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
819            building_block_nums=(3, 8, 36, 3),
820            building_block_preceding_output_channels=(64, 256, 512, 1024),
821            building_block_input_channels=(64, 128, 256, 512),
822            output_dim=output_dim,
823            activation_layer=activation_layer,
824            batch_normalization=batch_normalization,
825            bias=bias,
826            **kwargs,
827        )

Construct and initialize the ResNet-152 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already provides a learnable shift, making a separate bias redundant.
  • kwargs: Reserved for multiple inheritance.
class CLResNet18(clarena.backbones.base.CLBackbone, ResNet18):
class CLResNet18(CLBackbone, ResNet18):
    r"""The ResNet-18 backbone network for continual learning.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        pretrained_weights: str | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-18 backbone network for continual learning.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last, which connects to CL output heads. Although this is determined by the architecture built before the flattening layer, we still need to provide it to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization already serves a similar purpose to a bias term.
        - **pretrained_weights** (`str` | `None`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(
            input_channels=input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            pretrained_weights=pretrained_weights,
            **kwargs,
        )

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for continual learning algorithms that need the hidden features for various purposes.
        """
        # Delegate to the plain ResNet-18 forward pass; `task_id` is intentionally unused.
        return ResNet18.forward(self, input, stage)

The ResNet-18 backbone network for continual learning.

This is a smaller architecture proposed in the original ResNet paper. It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.

CLResNet18( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
836    def __init__(
837        self,
838        input_channels: int,
839        output_dim: int,
840        activation_layer: nn.Module | None = nn.ReLU,
841        batch_normalization: bool = True,
842        bias: bool = False,
843        pretrained_weights: str | None = None,
844        **kwargs,
845    ) -> None:
846        r"""Construct and initialize the ResNet-18 backbone network for continual learning.
847
848        **Args:**
849        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
850        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
851        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
852        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
853        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
854        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
855        - **kwargs**: Reserved for multiple inheritance.
856        """
857        super().__init__(
858            input_channels=input_channels,
859            output_dim=output_dim,
860            activation_layer=activation_layer,
861            batch_normalization=batch_normalization,
862            bias=bias,
863            pretrained_weights=pretrained_weights,
864            **kwargs,
865        )

Construct and initialize the ResNet-18 backbone network for continual learning.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalization already serves a similar purpose to a bias term.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
867    def forward(
868        self, input: Tensor, stage: str = None, task_id: int | None = None
869    ) -> tuple[Tensor, dict[str, Tensor]]:
870        r"""The forward pass for data. It is the same for all tasks.
871
872        **Args:**
873        - **input** (`Tensor`): the input tensor from data.
874        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
875        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
876
877        **Returns:**
878        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
879        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
880        """
881        return ResNet18.forward(self, input, stage)  # call the ResNet18 forward method

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str | None): Unused. Kept for API compatibility with other backbones.
  • task_id (int | None): Unused. Kept for API compatibility with other continual learning backbones.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class CLResNet34(clarena.backbones.base.CLBackbone, ResNet34):
class CLResNet34(CLBackbone, ResNet34):
    r"""The ResNet-34 backbone network for continual learning.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        pretrained_weights: str | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-34 backbone network for continual learning.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last, which connects to CL output heads. Although this is determined by the architecture built before the flattening layer, we still need to provide it to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization already serves a similar purpose to a bias term.
        - **pretrained_weights** (`str` | `None`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(
            input_channels=input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            pretrained_weights=pretrained_weights,
            **kwargs,
        )

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for continual learning algorithms that need the hidden features for various purposes.
        """
        # Delegate to the plain ResNet-34 forward pass; `task_id` is intentionally unused.
        return ResNet34.forward(self, input, stage)

The ResNet-34 backbone network for continual learning.

This is a smaller architecture proposed in the original ResNet paper. It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.

CLResNet34( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
890    def __init__(
891        self,
892        input_channels: int,
893        output_dim: int,
894        activation_layer: nn.Module | None = nn.ReLU,
895        batch_normalization: bool = True,
896        bias: bool = False,
897        pretrained_weights: str | None = None,
898        **kwargs,
899    ) -> None:
900        r"""Construct and initialize the ResNet-34 backbone network for continual learning.
901
902        **Args:**
903        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
904        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
905        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
906        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
907        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
908        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
909        - **kwargs**: Reserved for multiple inheritance.
910        """
911        super().__init__(
912            input_channels=input_channels,
913            output_dim=output_dim,
914            activation_layer=activation_layer,
915            batch_normalization=batch_normalization,
916            bias=bias,
917            pretrained_weights=pretrained_weights,
918            **kwargs,
919        )

Construct and initialize the ResNet-34 backbone network for continual learning.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalization already serves a similar purpose to a bias term.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
921    def forward(
922        self, input: Tensor, stage: str = None, task_id: int | None = None
923    ) -> tuple[Tensor, dict[str, Tensor]]:
924        r"""The forward pass for data. It is the same for all tasks.
925
926        **Args:**
927        - **input** (`Tensor`): the input tensor from data.
928        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
929        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
930
931        **Returns:**
932        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
933        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
934        """
935        return ResNet34.forward(self, input, stage)  # call the ResNet34 forward method

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str | None): Unused. Kept for API compatibility with other backbones.
  • task_id (int | None): Unused. Kept for API compatibility with other continual learning backbones.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class CLResNet50(clarena.backbones.base.CLBackbone, ResNet50):
class CLResNet50(CLBackbone, ResNet50):
    r"""The ResNet-50 backbone network for continual learning.

    This is a larger (bottleneck-block) architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalization: bool = True,
        bias: bool = False,
        pretrained_weights: str | None = None,
        **kwargs,
    ) -> None:
        r"""Construct and initialize the ResNet-50 backbone network for continual learning.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last, which connects to CL output heads. Although this is determined by the architecture built before the flattening layer, we still need to provide it to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization already serves a similar purpose to a bias term.
        - **pretrained_weights** (`str` | `None`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
        - **kwargs**: Reserved for multiple inheritance.
        """
        super().__init__(
            input_channels=input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalization=batch_normalization,
            bias=bias,
            pretrained_weights=pretrained_weights,
            **kwargs,
        )

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for continual learning algorithms that need the hidden features for various purposes.
        """
        # Delegate to the plain ResNet-50 forward pass; `task_id` is intentionally unused.
        return ResNet50.forward(self, input, stage)

The ResNet-50 backbone network for continual learning.

This is a larger (bottleneck-block) architecture proposed in the original ResNet paper. It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.

CLResNet50( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
944    def __init__(
945        self,
946        input_channels: int,
947        output_dim: int,
948        activation_layer: nn.Module | None = nn.ReLU,
949        batch_normalization: bool = True,
950        bias: bool = False,
951        pretrained_weights: str | None = None,
952        **kwargs,
953    ) -> None:
954        r"""Construct and initialize the ResNet-50 backbone network for continual learning.
955
956        **Args:**
957        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
958        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
959        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
960        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
961        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
962        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
963        - **kwargs**: Reserved for multiple inheritance.
964        """
965        super().__init__(
966            input_channels=input_channels,
967            output_dim=output_dim,
968            activation_layer=activation_layer,
969            batch_normalization=batch_normalization,
970            bias=bias,
971            pretrained_weights=pretrained_weights,
972            **kwargs,
973        )

Construct and initialize the ResNet-50 backbone network for continual learning.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalization already serves a similar purpose to a bias term.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
975    def forward(
976        self, input: Tensor, stage: str = None, task_id: int | None = None
977    ) -> tuple[Tensor, dict[str, Tensor]]:
978        r"""The forward pass for data. It is the same for all tasks.
979
980        **Args:**
981        - **input** (`Tensor`): the input tensor from data.
982        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
983        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
984
985        **Returns:**
986        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
987        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
988        """
989        return ResNet50.forward(self, input, stage)  # call the ResNet50 forward method

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str | None): Unused. Kept for API compatibility with other backbones.
  • task_id (int | None): Unused. Kept for API compatibility with other continual learning backbones.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class CLResNet101(clarena.backbones.base.CLBackbone, ResNet101):
 992class CLResNet101(CLBackbone, ResNet101):
 993    r"""The ResNet-101 backbone network for continual learning.
 994
 995    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
 996    """
 997
 998    def __init__(
 999        self,
1000        input_channels: int,
1001        output_dim: int,
1002        activation_layer: nn.Module | None = nn.ReLU,
1003        batch_normalization: bool = True,
1004        bias: bool = False,
1005        pretrained_weights: str | None = None,
1006        **kwargs,
1007    ) -> None:
1008        r"""Construct and initialize the ResNet-101 backbone network for continual learning.
1009
1010        **Args:**
1011        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
1012        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1013        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1014        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
1015        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
1016        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
1017        - **kwargs**: Reserved for multiple inheritance.
1018        """
1019        super().__init__(
1020            input_channels=input_channels,
1021            output_dim=output_dim,
1022            activation_layer=activation_layer,
1023            batch_normalization=batch_normalization,
1024            bias=bias,
1025            pretrained_weights=pretrained_weights,
1026            **kwargs,
1027        )
1028
1029    def forward(
1030        self, input: Tensor, stage: str = None, task_id: int | None = None
1031    ) -> tuple[Tensor, dict[str, Tensor]]:
1032        r"""The forward pass for data. It is the same for all tasks.
1033
1034        **Args:**
1035        - **input** (`Tensor`): the input tensor from data.
1036        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
1037        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
1038
1039        **Returns:**
1040        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
1041        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
1042        """
1043        return ResNet101.forward(
1044            self, input, stage
1045        )  # call the ResNet101 forward method

The ResNet-101 backbone network for continual learning.

This is a larger architecture proposed in the original ResNet paper. It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.

CLResNet101( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
 998    def __init__(
 999        self,
1000        input_channels: int,
1001        output_dim: int,
1002        activation_layer: nn.Module | None = nn.ReLU,
1003        batch_normalization: bool = True,
1004        bias: bool = False,
1005        pretrained_weights: str | None = None,
1006        **kwargs,
1007    ) -> None:
1008        r"""Construct and initialize the ResNet-101 backbone network for continual learning.
1009
1010        **Args:**
1011        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
1012        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1013        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1014        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
1015        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
1016        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
1017        - **kwargs**: Reserved for multiple inheritance.
1018        """
1019        super().__init__(
1020            input_channels=input_channels,
1021            output_dim=output_dim,
1022            activation_layer=activation_layer,
1023            batch_normalization=batch_normalization,
1024            bias=bias,
1025            pretrained_weights=pretrained_weights,
1026            **kwargs,
1027        )

Construct and initialize the ResNet-101 backbone network for continual learning.

Args:

  • input_channels (int): the number of channels of the input to this network. Note that convolutional networks require the number of input channels rather than a flat input dimension.
  • output_dim (int): the output dimension after the final flattening, which connects to the CL output heads. Although this value is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
  • activation_layer (nn.Module): the activation function applied after each layer; if None, no activation layer is used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already serves a similar purpose to a bias term.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
1029    def forward(
1030        self, input: Tensor, stage: str = None, task_id: int | None = None
1031    ) -> tuple[Tensor, dict[str, Tensor]]:
1032        r"""The forward pass for data. It is the same for all tasks.
1033
1034        **Args:**
1035        - **input** (`Tensor`): the input tensor from data.
1036        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
1037        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
1038
1039        **Returns:**
1040        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
1041        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
1042        """
1043        return ResNet101.forward(
1044            self, input, stage
1045        )  # call the ResNet101 forward method

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str | None): Unused. Kept for API compatibility with other backbones.
  • task_id (int | None): Unused. Kept for API compatibility with other continual learning backbones.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class CLResNet152(clarena.backbones.base.CLBackbone, ResNet152):
1048class CLResNet152(CLBackbone, ResNet152):
1049    r"""The ResNet-152 backbone network for continual learning.
1050
1051    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
1052    """
1053
1054    def __init__(
1055        self,
1056        input_channels: int,
1057        output_dim: int,
1058        activation_layer: nn.Module | None = nn.ReLU,
1059        batch_normalization: bool = True,
1060        bias: bool = False,
1061        pretrained_weights: str | None = None,
1062        **kwargs,
1063    ) -> None:
1064        r"""Construct and initialize the ResNet-152 backbone network for continual learning.
1065
1066        **Args:**
1067        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
1068        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1069        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1070        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
1071        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
1072        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
1073        - **kwargs**: Reserved for multiple inheritance.
1074        """
1075        super().__init__(
1076            input_channels=input_channels,
1077            output_dim=output_dim,
1078            activation_layer=activation_layer,
1079            batch_normalization=batch_normalization,
1080            bias=bias,
1081            pretrained_weights=pretrained_weights,
1082            **kwargs,
1083        )
1084
1085    def forward(
1086        self, input: Tensor, stage: str = None, task_id: int | None = None
1087    ) -> tuple[Tensor, dict[str, Tensor]]:
1088        r"""The forward pass for data. It is the same for all tasks.
1089
1090        **Args:**
1091        - **input** (`Tensor`): the input tensor from data.
1092        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
1093        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
1094
1095        **Returns:**
1096        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
1097        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
1098        """
1099        return ResNet152.forward(
1100            self, input, stage
1101        )  # call the ResNet152 forward method

The ResNet-152 backbone network for continual learning.

This is a larger architecture proposed in the original ResNet paper. It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.

CLResNet152( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalization: bool = True, bias: bool = False, pretrained_weights: str | None = None, **kwargs)
1054    def __init__(
1055        self,
1056        input_channels: int,
1057        output_dim: int,
1058        activation_layer: nn.Module | None = nn.ReLU,
1059        batch_normalization: bool = True,
1060        bias: bool = False,
1061        pretrained_weights: str | None = None,
1062        **kwargs,
1063    ) -> None:
1064        r"""Construct and initialize the ResNet-152 backbone network for continual learning.
1065
1066        **Args:**
1067        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
1068        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1069        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1070        - **batch_normalization** (`bool`): whether to use batch normalization after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
1071        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalization are doing the similar thing with bias.
1072        - **pretrained_weights** (`str`): the name of pretrained weights to be loaded. See [TorchVision docs](https://pytorch.org/vision/main/models.html). If `None`, no pretrained weights are loaded. Default `None`.
1073        - **kwargs**: Reserved for multiple inheritance.
1074        """
1075        super().__init__(
1076            input_channels=input_channels,
1077            output_dim=output_dim,
1078            activation_layer=activation_layer,
1079            batch_normalization=batch_normalization,
1080            bias=bias,
1081            pretrained_weights=pretrained_weights,
1082            **kwargs,
1083        )

Construct and initialize the ResNet-152 backbone network for continual learning.

Args:

  • input_channels (int): the number of channels of the input to this network. Note that convolutional networks require the number of input channels rather than a flat input dimension.
  • output_dim (int): the output dimension after the final flattening, which connects to the CL output heads. Although this value is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
  • activation_layer (nn.Module): the activation function applied after each layer; if None, no activation layer is used. Default nn.ReLU.
  • batch_normalization (bool): whether to use batch normalization after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalization already serves a similar purpose to a bias term.
  • pretrained_weights (str): the name of pretrained weights to be loaded. See TorchVision docs. If None, no pretrained weights are loaded. Default None.
  • kwargs: Reserved for multiple inheritance.
def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
1085    def forward(
1086        self, input: Tensor, stage: str = None, task_id: int | None = None
1087    ) -> tuple[Tensor, dict[str, Tensor]]:
1088        r"""The forward pass for data. It is the same for all tasks.
1089
1090        **Args:**
1091        - **input** (`Tensor`): the input tensor from data.
1092        - **stage** (`str` | `None`): Unused. Kept for API compatibility with other backbones.
1093        - **task_id** (`int` | `None`): Unused. Kept for API compatibility with other continual learning backbones.
1094
1095        **Returns:**
1096        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
1097        - **activations** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Keys (`str`) are the weighted layer names and values (`Tensor`) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
1098        """
1099        return ResNet152.forward(
1100            self, input, stage
1101        )  # call the ResNet152 forward method

The forward pass for data. It is the same for all tasks.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str | None): Unused. Kept for API compatibility with other backbones.
  • task_id (int | None): Unused. Kept for API compatibility with other continual learning backbones.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • activations (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Keys (str) are the weighted layer names and values (Tensor) are the hidden feature tensors. This is used for the continual learning algorithms that need to use the hidden features for various purposes.