clarena.backbones.resnet

The submodule in `backbones` for the ResNet backbone network.

   1r"""
   2The submodule in `backbones` for ResNet backbone network.
   3"""
   4
# Public API of this submodule: the plain ResNet building blocks, base class and
# fixed-depth networks, followed by the names of their `HATMask*` counterparts.
__all__ = [
    "ResNetBlockSmall",
    "ResNetBlockLarge",
    "ResNetBase",
    "ResNet18",
    "ResNet34",
    "ResNet50",
    "ResNet101",
    "ResNet152",
    "HATMaskResNetBlockSmall",
    "HATMaskResNetBlockLarge",
    "HATMaskResNetBase",
    "HATMaskResNet18",
    "HATMaskResNet34",
    "HATMaskResNet50",
    "HATMaskResNet101",
    "HATMaskResNet152",
]
  23
  24
  25from torch import Tensor, nn
  26
  27from clarena.backbones import CLBackbone, HATMaskBackbone
  28
  29
  30class ResNetBlockSmall(CLBackbone):
  31    r"""The smaller building block for ResNet-18/34.
  32
  33    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
  34    """
  35
  36    def __init__(
  37        self,
  38        outer_layer_name: str,
  39        block_idx: int,
  40        preceding_output_channels: int,
  41        input_channels: int,
  42        overall_stride: int,
  43        activation_layer: nn.Module | None = nn.ReLU,
  44        batch_normalisation: bool = True,
  45        bias: bool = False,
  46    ) -> None:
  47        r"""Construct and initialise the smaller building block.
  48
  49        **Args:**
  50        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  51        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  52        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
  53        - **input_channels** (`int`): the number of channels of input of this building block.
  54        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
  55        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
  56        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
  57        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
  58        """
  59        CLBackbone.__init__(self, output_dim=None)
  60
  61        self.batch_normalisation: bool = batch_normalisation
  62        r"""Store whether to use batch normalisation after the fully-connected layers."""
  63        self.activation: bool = activation_layer is not None
  64        r"""Store whether to use activation function after the fully-connected layers."""
  65
  66        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
  67        r"""Format and store full name of the 1st weighted convolutional layer. """
  68        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
  69        r"""Format and store full name of the 2nd weighted convolutional layer. """
  70
  71        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
  72        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
  73        layer_output_channels = (
  74            input_channels  # the output channels of the 1st convolutional layer
  75        )
  76        self.conv1 = nn.Conv2d(
  77            in_channels=layer_input_channels,
  78            out_channels=layer_output_channels,
  79            kernel_size=3,
  80            stride=1,
  81            padding=1,
  82            bias=bias,
  83        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
  84        r"""The 1st weight convolutional layer of the smaller building block. """
  85        self.weighted_layer_names.append(
  86            self.full_1st_layer_name
  87        )  # update the weighted layer names
  88        if self.batch_normalisation:
  89            self.conv_bn1 = nn.BatchNorm2d(
  90                num_features=layer_output_channels
  91            )  # construct the batch normalisation layer
  92            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
  93        if self.activation:
  94            self.conv_activation1 = activation_layer()  # construct the activation layer
  95            r"""The activation layer after the 1st weighted convolutional layer. """
  96
  97        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
  98        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
  99        layer_output_channels = (
 100            input_channels * 1
 101        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 102        self.conv2 = nn.Conv2d(
 103            in_channels=layer_input_channels,
 104            out_channels=layer_output_channels,
 105            kernel_size=3,
 106            stride=overall_stride,
 107            padding=1,
 108            bias=bias,
 109        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
 110        r"""The 2nd weight convolutional layer of the smaller building block. """
 111        self.weighted_layer_names.append(
 112            self.full_2nd_layer_name
 113        )  # update the weighted layer names
 114        if batch_normalisation:
 115            self.conv_bn2 = nn.BatchNorm2d(
 116                num_features=layer_output_channels
 117            )  # construct the batch normalisation layer
 118            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
 119        if self.activation:
 120            self.conv_activation2 = activation_layer()  # construct the activation layer
 121            r"""The activation layer after the 2nd weighted convolutional layer. """
 122
 123        self.identity_downsample: nn.Module = (
 124            nn.Conv2d(
 125                in_channels=preceding_output_channels,
 126                out_channels=input_channels,
 127                kernel_size=1,
 128                stride=overall_stride,
 129                bias=False,
 130            )
 131            if preceding_output_channels != input_channels or overall_stride != 1
 132            else None
 133        )  # construct the identity downsample function
 134        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
 135
 136    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
 137        r"""The forward pass for data.
 138
 139        **Args:**
 140        - **input** (`Tensor`): the input feature maps.
 141
 142        **Returns:**
 143        - **output_feature** (`Tensor`): the output feature maps.
 144        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 145        """
 146        hidden_features = {}
 147
 148        identity = (
 149            self.identity_downsample(input)
 150            if self.identity_downsample is not None
 151            else input
 152        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
 153
 154        x = input
 155        x = self.conv1(x)
 156        if self.batch_normalisation:
 157            x = self.conv_bn1(x)
 158        if self.activation:
 159            x = self.conv_activation1(x)
 160        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
 161
 162        x = self.conv2(x)
 163        if self.batch_normalisation:
 164            x = self.conv_bn2(x)
 165
 166        x = x + identity
 167        if self.activation:
 168            x = self.conv_activation2(x)  # activation after the shortcut connection
 169        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
 170
 171        output_feature = x
 172
 173        return output_feature, hidden_features
 174
 175
 176class ResNetBlockLarge(CLBackbone):
 177    r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the paper.
 178
 179    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 180    """
 181
 182    def __init__(
 183        self,
 184        outer_layer_name: str,
 185        block_idx: int,
 186        preceding_output_channels: int,
 187        input_channels: int,
 188        overall_stride: int,
 189        activation_layer: nn.Module | None = nn.ReLU,
 190        batch_normalisation: bool = True,
 191        bias: bool = False,
 192    ) -> None:
 193        r"""Construct and initialise the larger building block.
 194
 195        **Args:**
 196        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 197        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 198        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 199        - **input_channels** (`int`): the number of channels of input of this building block.
 200        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
 201        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 202        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 203        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 204        """
 205        CLBackbone.__init__(self, output_dim=None)
 206
 207        self.batch_normalisation: bool = batch_normalisation
 208        r"""Store whether to use batch normalisation after the fully-connected layers."""
 209        self.activation: bool = activation_layer is not None
 210        r"""Store whether to use activation function after the fully-connected layers."""
 211
 212        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
 213        r"""Format and store full name of the 1st weighted convolutional layer. """
 214        self.full_2nd_layer_name = f"{outer_layer_name}_{block_idx}_conv2"
 215        r"""Format and store full name of the 2nd weighted convolutional layer. """
 216        self.full_3rd_layer_name = f"{outer_layer_name}_{block_idx}_conv3"
 217        r"""Format and store full name of the 3rd weighted convolutional layer. """
 218
 219        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 220        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
 221        layer_output_channels = (
 222            input_channels  # the output channels of the 1st convolutional layer
 223        )
 224        self.conv1 = nn.Conv2d(
 225            in_channels=layer_input_channels,
 226            out_channels=layer_output_channels,
 227            kernel_size=1,
 228            stride=1,
 229            padding=0,
 230            bias=bias,
 231        )  # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here
 232        r"""The 1st weight convolutional layer of the larger building block. """
 233        self.weighted_layer_names.append(
 234            self.full_1st_layer_name
 235        )  # update the weighted layer names
 236        if self.batch_normalisation:
 237            self.conv_bn1 = nn.BatchNorm2d(
 238                num_features=layer_output_channels
 239            )  # construct the batch normalisation layer
 240            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 241        if self.activation:
 242            self.conv_activation1 = activation_layer()  # construct the activation layer
 243            r"""The activation layer after the 1st weighted convolutional layer. """
 244
 245        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 246        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
 247        layer_output_channels = (
 248            input_channels
 249            * 1  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 250        )
 251        self.conv2 = nn.Conv2d(
 252            in_channels=layer_input_channels,
 253            out_channels=layer_output_channels,
 254            kernel_size=3,
 255            stride=overall_stride,
 256            padding=1,
 257            bias=bias,
 258        )  # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here
 259        r"""The 2nd weight convolutional layer of the larger building block. """
 260        self.weighted_layer_names.append(
 261            self.full_2nd_layer_name
 262        )  # update the weighted layer names
 263        if self.batch_normalisation:
 264            self.conv_bn2 = nn.BatchNorm2d(
 265                num_features=layer_output_channels
 266            )  # construct the batch normalisation layer
 267            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
 268        if self.activation:
 269            self.conv_activation2 = activation_layer()  # construct the activation layer
 270            r"""The activation layer after the 2nd weighted convolutional layer. """
 271
 272        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 273        layer_input_channels = (
 274            input_channels * 1
 275        )  # the input channels of the 2nd convolutional layer, which is `input_channels * 1`, the same as the output channels of the 1st convolutional layer
 276        layer_output_channels = (
 277            input_channels
 278            * 4  # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels
 279        )
 280        self.conv3 = nn.Conv2d(
 281            in_channels=layer_input_channels,
 282            out_channels=layer_output_channels,
 283            kernel_size=1,
 284            stride=1,
 285            padding=0,
 286            bias=bias,
 287        )  # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here
 288        r"""The 3rd weight convolutional layer of the larger building block. """
 289        self.weighted_layer_names.append(
 290            self.full_3rd_layer_name
 291        )  # update the weighted layer names
 292        if batch_normalisation:
 293            self.conv_bn3 = nn.BatchNorm2d(
 294                num_features=layer_output_channels
 295            )  # construct the batch normalisation layer
 296            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
 297        if self.activation:
 298            self.conv_activation3 = activation_layer()  # construct the activation layer
 299            r"""The activation layer after the 3rd weighted convolutional layer. """
 300
 301        self.identity_downsample: nn.Module = (
 302            nn.Conv2d(
 303                in_channels=preceding_output_channels,
 304                out_channels=input_channels * 4,
 305                kernel_size=1,
 306                stride=overall_stride,
 307                bias=False,
 308            )
 309            if preceding_output_channels != input_channels * 4 or overall_stride != 1
 310            else None
 311        )
 312        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
 313
 314    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
 315        r"""The forward pass for data.
 316
 317        **Args:**
 318        - **input** (`Tensor`): the input feature maps.
 319
 320        **Returns:**
 321        - **output_feature** (`Tensor`): the output feature maps.
 322        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 323        """
 324        hidden_features = {}
 325
 326        identity = (
 327            self.identity_downsample(input)
 328            if self.identity_downsample is not None
 329            else input
 330        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
 331
 332        x = input
 333        x = self.conv1(x)
 334        if self.batch_normalisation:
 335            x = self.conv_bn1(x)
 336        if self.activation:
 337            x = self.conv_activation1(x)
 338        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
 339
 340        x = self.conv2(x)
 341        if self.batch_normalisation:
 342            x = self.conv_bn2(x)
 343        if self.activation:
 344            x = self.conv_activation2(x)
 345        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
 346
 347        x = self.conv3(x)
 348        if self.batch_normalisation:
 349            x = self.conv_bn3(x)
 350
 351        x = x + identity
 352        if self.activation:
 353            x = self.conv_activation3(x)  # activation after the shortcut connection
 354        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature
 355
 356        output_feature = x
 357
 358        return output_feature, hidden_features
 359
 360
 361class ResNetBase(CLBackbone):
 362    r"""The base class of [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 363
 364    ResNet is a convolutional network architecture, which has 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers which contains multiple convolutional parameter layer. Each layer of the 4 are constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are average pooling layer and a fully connected layer which connects to the CL output heads.
 365    """
 366
 367    def __init__(
 368        self,
 369        input_channels: int,
 370        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
 371        building_block_nums: tuple[int, int, int, int],
 372        building_block_preceding_output_channels: tuple[int, int, int, int],
 373        building_block_input_channels: tuple[int, int, int, int],
 374        output_dim: int,
 375        activation_layer: nn.Module | None = nn.ReLU,
 376        batch_normalisation: bool = True,
 377        bias: bool = False,
 378    ) -> None:
 379        r"""Construct and initialise the ResNet backbone network.
 380
 381        **Args:**
 382        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
 383        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
 384        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
 385        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
 386        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
 387        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 388        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 389        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 390        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 391        """
 392        CLBackbone.__init__(self, output_dim=output_dim)
 393
 394        self.batch_normalisation: bool = batch_normalisation
 395        r"""Store whether to use batch normalisation after the fully-connected layers."""
 396        self.activation: bool = activation_layer is not None
 397        r"""Store whether to use activation function after the fully-connected layers."""
 398
 399        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 400        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receive the input of the entire network
 401        layer_output_channels = 64  # the output channels of the 1st convolutional layer
 402        self.conv1 = nn.Conv2d(
 403            in_channels=layer_input_channels,
 404            out_channels=layer_output_channels,
 405            kernel_size=7,
 406            stride=2,
 407            padding=3,
 408            bias=bias,
 409        )  # construct the 1st weight convolutional layer of the entire ResNet
 410        r"""The 1st weight convolutional layer of the entire ResNet. It  is always with fixed kernel size 7x7, stride 2, and padding 3. """
 411        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
 412        if self.batch_normalisation:
 413            self.conv_bn1 = nn.BatchNorm2d(
 414                num_features=layer_output_channels
 415            )  # construct the batch normalisation layer
 416            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 417        if self.activation:
 418            self.conv_activation1 = activation_layer()  # construct the activation layer
 419            r"""The activation layer after the 1st weighted convolutional layer. """
 420
 421        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  #
 422        r"""The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2. """
 423
 424        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
 425        self.conv2x = self._multiple_blocks(
 426            layer_name="conv2x",
 427            building_block_type=building_block_type,
 428            building_block_num=building_block_nums[0],
 429            preceding_output_channels=building_block_preceding_output_channels[0],
 430            input_channels=building_block_input_channels[0],
 431            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
 432            activation_layer=activation_layer,
 433            batch_normalisation=batch_normalisation,
 434            bias=bias,
 435        )
 436        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """
 437
 438        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
 439        self.conv3x = self._multiple_blocks(
 440            layer_name="conv3x",
 441            building_block_type=building_block_type,
 442            building_block_num=building_block_nums[1],
 443            preceding_output_channels=building_block_preceding_output_channels[1],
 444            input_channels=building_block_input_channels[1],
 445            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
 446            activation_layer=activation_layer,
 447            batch_normalisation=batch_normalisation,
 448            bias=bias,
 449        )
 450        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """
 451
 452        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
 453        self.conv4x = self._multiple_blocks(
 454            layer_name="conv4x",
 455            building_block_type=building_block_type,
 456            building_block_num=building_block_nums[2],
 457            preceding_output_channels=building_block_preceding_output_channels[2],
 458            input_channels=building_block_input_channels[2],
 459            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
 460            activation_layer=activation_layer,
 461            batch_normalisation=batch_normalisation,
 462            bias=bias,
 463        )
 464        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """
 465
 466        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
 467        self.conv5x = self._multiple_blocks(
 468            layer_name="conv5x",
 469            building_block_type=building_block_type,
 470            building_block_num=building_block_nums[3],
 471            preceding_output_channels=building_block_preceding_output_channels[3],
 472            input_channels=building_block_input_channels[3],
 473            overall_stride=2,  # the overall stride of the 2nd convolutional layer should be 2, making 14x14 -> 7x7. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
 474            activation_layer=activation_layer,
 475            batch_normalisation=batch_normalisation,
 476            bias=bias,
 477        )
 478        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """
 479
 480        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
 481        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """
 482
 483    def _multiple_blocks(
 484        self,
 485        layer_name: str,
 486        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
 487        building_block_num: int,
 488        preceding_output_channels: int,
 489        input_channels: int,
 490        overall_stride: int,
 491        activation_layer: nn.Module | None = nn.ReLU,
 492        batch_normalisation: bool = True,
 493        bias: bool = False,
 494    ) -> nn.Sequential:
 495        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2-5 convolutional layers of the ResNet.
 496
 497        The "shortcut connections" are performed between the input and output of each building block:
 498        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
 499        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer to make the dimensions match.
 500
 501        **Args:**
 502        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
 503        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of the building block.
 504        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
 505        - **preceding_output_channels** (`int`): the number of channels of preceding output of this entire multi-building-block layer.
 506        - **input_channels** (`int`): the number of channels of input of this multi-building-block layer.
 507        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block where other building blocks remain their own overall stride of 1. Inside that building block, this stride is performed at certain convolutional layer in the building block where other convolutional layers remain stride of 1:
 508            - For `ResNetBlockSmall`, it performs at the 2nd (last) layer.
 509            - For `ResNetBlockLarge`, it performs at the 2nd (middle) layer.
 510        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 511        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 512        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 513
 514        **Returns:**
 515        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
 516        """
 517
 518        layer = []
 519
 520        for block_idx in range(building_block_num):
 521            layer.append(
 522                building_block_type(
 523                    outer_layer_name=layer_name,
 524                    block_idx=block_idx,
 525                    preceding_output_channels=(
 526                        preceding_output_channels
 527                        if block_idx == 0
 528                        else (
 529                            input_channels
 530                            if building_block_type == ResNetBlockSmall
 531                            else input_channels * 4
 532                        )
 533                    ),  # if it's the 1st block in this multi-building-block layer, it should be the number of channels of the preceding output of this entire multi-building-block layer. Otherwise, it should be the number of channels from last building block where the number of channels is 4 times expanded as the input channels for `ResNetBlockLarge` than `ResNetBlockSmall`.
 534                    input_channels=input_channels,
 535                    overall_stride=(
 536                        overall_stride if block_idx == 0 else 1
 537                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
 538                    activation_layer=activation_layer,
 539                    batch_normalisation=batch_normalisation,
 540                    bias=bias,
 541                )
 542            )
 543
 544            self.weighted_layer_names += layer[
 545                -1
 546            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync to the weighted layer names list in the outer network
 547
 548        return nn.Sequential(*layer)
 549
 550    def forward(
 551        self, input: Tensor, stage: str = None, task_id: int | None = None
 552    ) -> tuple[Tensor, dict[str, Tensor]]:
 553        r"""The forward pass for data. It is the same for all tasks.
 554
 555        **Args:**
 556        - **input** (`Tensor`): the input tensor from data.
 557
 558        **Returns:**
 559        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
 560        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
 561        """
 562        batch_size = input.size(0)
 563        hidden_features = {}
 564
 565        x = input
 566
 567        x = self.conv1(x)
 568        if self.batch_normalisation:
 569            x = self.conv_bn1(x)
 570        if self.activation:
 571            x = self.conv_activation1(x)
 572        hidden_features["conv1"] = x
 573
 574        x = self.maxpool(x)
 575
 576        for block in self.conv2x:
 577            x, hidden_features_block = block(x)
 578            hidden_features.update(hidden_features_block)  # store the hidden feature
 579        for block in self.conv3x:
 580            x, hidden_features_block = block(x)
 581            hidden_features.update(hidden_features_block)  # store the hidden feature
 582        for block in self.conv4x:
 583            x, hidden_features_block = block(x)
 584            hidden_features.update(hidden_features_block)  # store the hidden feature
 585        for block in self.conv5x:
 586            x, hidden_features_block = block(x)
 587            hidden_features.update(hidden_features_block)  # store the hidden feature
 588
 589        x = self.avepool(x)
 590
 591        output_feature = x.view(batch_size, -1)  # flatten before going through heads
 592
 593        return output_feature, hidden_features
 594
 595
 596class ResNet18(ResNetBase):
 597    r"""ResNet-18 backbone network.
 598
 599    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
 600    """
 601
 602    def __init__(
 603        self,
 604        input_channels: int,
 605        output_dim: int,
 606        activation_layer: nn.Module | None = nn.ReLU,
 607        batch_normalisation: bool = True,
 608        bias: bool = False,
 609    ) -> None:
 610        r"""Construct and initialise the ResNet-18 backbone network.
 611
 612        **Args:**
 613        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 614        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 615        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 616        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 617        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 618        """
 619        ResNetBase.__init__(
 620            self,
 621            input_channels=input_channels,
 622            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
 623            building_block_nums=(2, 2, 2, 2),
 624            building_block_preceding_output_channels=(64, 64, 128, 256),
 625            building_block_input_channels=(64, 128, 256, 512),
 626            output_dim=output_dim,
 627            activation_layer=activation_layer,
 628            batch_normalisation=batch_normalisation,
 629            bias=bias,
 630        )
 631
 632
 633class ResNet34(ResNetBase):
 634    r"""ResNet-34 backbone network.
 635
 636    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
 637    """
 638
 639    def __init__(
 640        self,
 641        input_channels: int,
 642        output_dim: int,
 643        activation_layer: nn.Module | None = nn.ReLU,
 644        batch_normalisation: bool = True,
 645        bias: bool = False,
 646    ) -> None:
 647        r"""Construct and initialise the ResNet-34 backbone network.
 648
 649        **Args:**
 650        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 651        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 652        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 653        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 654        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 655        """
 656        ResNetBase.__init__(
 657            self,
 658            input_channels=input_channels,
 659            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
 660            building_block_nums=(3, 4, 6, 3),
 661            building_block_preceding_output_channels=(64, 64, 128, 256),
 662            building_block_input_channels=(64, 128, 256, 512),
 663            output_dim=output_dim,
 664            activation_layer=activation_layer,
 665            batch_normalisation=batch_normalisation,
 666            bias=bias,
 667        )
 668
 669
 670class ResNet50(ResNetBase):
 671    r"""ResNet-50 backbone network.
 672
 673    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
 674    """
 675
 676    def __init__(
 677        self,
 678        input_channels: int,
 679        output_dim: int,
 680        activation_layer: nn.Module | None = nn.ReLU,
 681        batch_normalisation: bool = True,
 682        bias: bool = False,
 683    ) -> None:
 684        r"""Construct and initialise the ResNet-50 backbone network.
 685
 686        **Args:**
 687        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 688        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 689        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 690        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 691        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 692        """
 693        ResNetBase.__init__(
 694            self,
 695            input_channels=input_channels,
 696            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
 697            building_block_nums=(3, 4, 6, 3),
 698            building_block_preceding_output_channels=(64, 256, 512, 1024),
 699            building_block_input_channels=(64, 128, 256, 512),
 700            output_dim=output_dim,
 701            activation_layer=activation_layer,
 702            batch_normalisation=batch_normalisation,
 703            bias=bias,
 704        )
 705
 706
 707class ResNet101(ResNetBase):
 708    r"""ResNet-101 backbone network.
 709
 710    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
 711    """
 712
 713    def __init__(
 714        self,
 715        input_channels: int,
 716        output_dim: int,
 717        activation_layer: nn.Module | None = nn.ReLU,
 718        batch_normalisation: bool = True,
 719        bias: bool = False,
 720    ) -> None:
 721        r"""Construct and initialise the ResNet-101 backbone network.
 722
 723        **Args:**
 724        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 725        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 726        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 727        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 728        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 729        """
 730        ResNetBase.__init__(
 731            self,
 732            input_channels=input_channels,
 733            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
 734            building_block_nums=(3, 4, 23, 3),
 735            building_block_preceding_output_channels=(64, 256, 512, 1024),
 736            building_block_input_channels=(64, 128, 256, 512),
 737            output_dim=output_dim,
 738            activation_layer=activation_layer,
 739            batch_normalisation=batch_normalisation,
 740            bias=bias,
 741        )
 742
 743
 744class ResNet152(ResNetBase):
 745    r"""ResNet-152 backbone network.
 746
 747    This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
 748    """
 749
 750    def __init__(
 751        self,
 752        input_channels: int,
 753        output_dim: int,
 754        activation_layer: nn.Module | None = nn.ReLU,
 755        batch_normalisation: bool = True,
 756        bias: bool = False,
 757    ) -> None:
 758        r"""Construct and initialise the ResNet-50 backbone network.
 759
 760        **Args:**
 761        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
 762        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
 763        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 764        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 765        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 766        """
 767        ResNetBase.__init__(
 768            self,
 769            input_channels=input_channels,
 770            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
 771            building_block_nums=(3, 8, 36, 3),
 772            building_block_preceding_output_channels=(64, 256, 512, 1024),
 773            building_block_input_channels=(64, 128, 256, 512),
 774            output_dim=output_dim,
 775            activation_layer=activation_layer,
 776            batch_normalisation=batch_normalisation,
 777            bias=bias,
 778        )
 779
 780
 781class HATMaskResNetBlockSmall(HATMaskBackbone, ResNetBlockSmall):
 782    r"""The smaller building block for HAT masked ResNet-18/34.
 783
 784    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 785
 786    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
 787    """
 788
 789    def __init__(
 790        self,
 791        outer_layer_name: str,
 792        block_idx: int,
 793        preceding_output_channels: int,
 794        input_channels: int,
 795        overall_stride: int,
 796        gate: str,
 797        activation_layer: nn.Module | None = nn.ReLU,
 798        bias: bool = False,
 799    ) -> None:
 800        r"""Construct and initialise the smaller building block with task embedding.
 801
 802        **Args:**
 803        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 804        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 805        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 806        - **input_channels** (`int`): the number of channels of input of this building block.
 807        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
 808        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
 809            - `sigmoid`: the sigmoid function.
 810        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 811        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
 812        """
 813        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
 814        ResNetBlockSmall.__init__(
 815            self,
 816            outer_layer_name=outer_layer_name,
 817            block_idx=block_idx,
 818            preceding_output_channels=preceding_output_channels,
 819            input_channels=input_channels,
 820            overall_stride=overall_stride,
 821            activation_layer=activation_layer,
 822            bias=bias,
 823        )
 824        self.register_hat_mask_module_explicitly(gate=gate)
 825
 826        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
 827        layer_output_channels = (
 828            input_channels  # the output channels of the 1st convolutional layer
 829        )
 830        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
 831            num_embeddings=1, embedding_dim=layer_output_channels
 832        )
 833
 834        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
 835        layer_output_channels = (
 836            input_channels * 1
 837        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 838        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
 839            num_embeddings=1, embedding_dim=layer_output_channels
 840        )
 841
 842    def forward(
 843        self,
 844        input: Tensor,
 845        stage: str,
 846        s_max: float | None = None,
 847        batch_idx: int | None = None,
 848        num_batches: int | None = None,
 849        test_mask: dict[str, Tensor] | None = None,
 850    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
 851        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.
 852
 853        **Args:**
 854        - **input** (`Tensor`): The input tensor from data.
 855        - **stage** (`str`): the stage of the forward pass, should be one of the following:
 856            1. 'train': training stage.
 857            2. 'validation': validation stage.
 858            3. 'test': testing stage.
 859        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
 860        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
 861        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
 862        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
 863
 864        **Returns:**
 865        - **output_feature** (`Tensor`): the output feature maps.
 866        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
 867        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
 868        """
 869        hidden_features = {}
 870
 871        # get the mask for the current task from the task embedding in this stage
 872        mask = self.get_mask(
 873            stage=stage,
 874            s_max=s_max,
 875            batch_idx=batch_idx,
 876            num_batches=num_batches,
 877            test_mask=test_mask,
 878        )
 879
 880        identity = (
 881            self.identity_downsample(input)
 882            if self.identity_downsample is not None
 883            else input
 884        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
 885
 886        x = input
 887        x = self.conv1(x)  # weighted convolutional layer first
 888        x = x * (
 889            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
 890        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
 891        if self.activation:
 892            x = self.conv_activation1(x)  # activation function third
 893        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
 894
 895        x = self.conv2(x)  # weighted convolutional layer first
 896        x = x + identity
 897        x = x * (
 898            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
 899        )  # apply the mask to the 2nd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input
 900        if self.activation:
 901            x = self.conv_activation2(x)  # activation after the shortcut connection
 902        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
 903
 904        output_feature = x
 905
 906        return output_feature, mask, hidden_features
 907
 908
 909class HATMaskResNetBlockLarge(HATMaskBackbone, ResNetBlockLarge):
 910    r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the ResNet paper.
 911
 912    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 913
 914    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
 915    """
 916
 917    def __init__(
 918        self,
 919        outer_layer_name: str,
 920        block_idx: int,
 921        preceding_output_channels: int,
 922        input_channels: int,
 923        overall_stride: int,
 924        gate: str,
 925        activation_layer: nn.Module | None = nn.ReLU,
 926        bias: bool = False,
 927    ) -> None:
 928        r"""Construct and initialise the larger building block with task embedding.
 929
 930        **Args:**
 931        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 932        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 933        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 934        - **input_channels** (`int`): the number of channels of input of this building block.
 935        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
 936        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
 937            - `sigmoid`: the sigmoid function.
 938        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 939        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
 940        """
 941        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
 942        ResNetBlockLarge.__init__(
 943            self,
 944            outer_layer_name=outer_layer_name,
 945            block_idx=block_idx,
 946            preceding_output_channels=preceding_output_channels,
 947            input_channels=input_channels,
 948            overall_stride=overall_stride,
 949            activation_layer=activation_layer,
 950            bias=bias,
 951        )
 952        self.register_hat_mask_module_explicitly(gate=gate)
 953
 954        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
 955        layer_output_channels = (
 956            input_channels  # the output channels of the 1st convolutional layer
 957        )
 958        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
 959            num_embeddings=1, embedding_dim=layer_output_channels
 960        )
 961
 962        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
 963        layer_output_channels = (
 964            input_channels * 1
 965        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 966        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
 967            num_embeddings=1, embedding_dim=layer_output_channels
 968        )
 969
 970        # construct the task embedding over the 3rd weighted convolutional layer. It is channel-wise
 971        layer_output_channels = (
 972            input_channels
 973            * 4  # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels
 974        )
 975        self.task_embedding_t[self.full_3rd_layer_name] = nn.Embedding(
 976            num_embeddings=1, embedding_dim=layer_output_channels
 977        )
 978
 979    def forward(
 980        self,
 981        input: Tensor,
 982        stage: str,
 983        s_max: float | None = None,
 984        batch_idx: int | None = None,
 985        num_batches: int | None = None,
 986        test_mask: dict[str, Tensor] | None = None,
 987    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
 988        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.
 989
 990        **Args:**
 991        - **input** (`Tensor`): The input tensor from data.
 992        - **stage** (`str`): the stage of the forward pass, should be one of the following:
 993            1. 'train': training stage.
 994            2. 'validation': validation stage.
 995            3. 'test': testing stage.
 996        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
 997        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
 998        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
 999        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
1000
1001        **Returns:**
1002        - **output_feature** (`Tensor`): the output feature maps.
1003        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
1004        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
1005        """
1006        hidden_features = {}
1007
1008        # get the mask for the current task from the task embedding in this stage
1009        mask = self.get_mask(
1010            stage=stage,
1011            s_max=s_max,
1012            batch_idx=batch_idx,
1013            num_batches=num_batches,
1014            test_mask=test_mask,
1015        )
1016
1017        identity = (
1018            self.identity_downsample(input)
1019            if self.identity_downsample is not None
1020            else input
1021        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
1022
1023        x = input
1024        x = self.conv1(x)  # weighted convolutional layer first
1025        x = x * (
1026            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
1027        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
1028        if self.activation:
1029            x = self.conv_activation1(x)  # activation function third
1030        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
1031
1032        x = self.conv2(x)  # weighted convolutional layer first
1033        x = x * (
1034            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
1035        )  # apply the mask to the 2nd convolutional layer. Broadcast the dimension of mask to match the input
1036        if self.activation:
1037            x = self.conv_activation2(x)  # activation function third
1038        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
1039
1040        x = self.conv3(x)  # weighted convolutional layer first
1041        x = x + identity
1042        x = x * (
1043            mask[self.full_3rd_layer_name].view(1, -1, 1, 1)
1044        )  # apply the mask to the 3rd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input
1045        if self.activation:
1046            x = self.activation3(x)  # activation after the shortcut connection
1047        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature
1048
1049        output_feature = x
1050
1051        return output_feature, mask, hidden_features
1052
1053
class HATMaskResNetBase(ResNetBase, HATMaskBackbone):
    r"""The base class of HAT masked [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    The network runs, in order: the 1st convolutional layer (`conv1`), a max pooling layer, four multi-building-block layers (`conv2x` to `conv5x`), an average pooling layer and a flattening step whose result is passed to the CL output heads. Each multi-building-block layer is a sequence of building blocks of one type, either small (`HATMaskResNetBlockSmall`) or large (`HATMaskResNetBlockLarge`). Each building block contains several weighted convolutional layers and a shortcut connection from its input to its output, which is why the network is called residual (find "shortcut connections" in the paper for more details).

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: type[HATMaskResNetBlockSmall | HATMaskResNetBlockLarge],
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism, so it is always switched off here.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
        - **building_block_type** (`type`): the class of building block used in the ResNet, either `HATMaskResNetBlockSmall` or `HATMaskResNetBlockLarge`. The class itself is passed, not an instance.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # init from both inherited classes. `HATMaskBackbone` goes first because `ResNetBase.__init__()` builds the blocks through `_multiple_blocks()`, which reads `self.gate`
        HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate)
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=building_block_type,
            building_block_nums=building_block_nums,
            building_block_preceding_output_channels=building_block_preceding_output_channels,
            building_block_input_channels=building_block_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=False,  # batch normalisation is incompatible with HAT mechanism
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(
            gate=gate
        )  # register all `nn.Module`s for HATMaskBackbone explicitly because the second `__init__()` wipes out them inited by the first `__init__()`
        self.update_multiple_blocks_task_embedding()

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.task_embedding_t["conv1"] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: type[HATMaskResNetBlockSmall | HATMaskResNetBlockLarge],
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = False,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks with task embedding. It's used to construct the 2-5 convolutional layers of the HAT masked ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer to make the dimensions match.

        As a side effect, the weighted layer names of every constructed block are appended to `self.weighted_layer_names`.

        **Args:**
        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`type`): the class of the building block, either `HATMaskResNetBlockSmall` or `HATMaskResNetBlockLarge`.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block where other building blocks remain their own overall stride of 1. Inside that building block, this stride is performed at certain convolutional layer in the building block where other convolutional layers remain stride of 1:
            - For `ResNetBlockSmall`, it performs at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it performs at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): ignored. In HATMaskResNet, batch normalisation is incompatible with HAT mechanism, so this value is never forwarded to the building blocks. We include this argument for compatibility with the original ResNet API.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """
        # every block after the 1st one receives the output of the previous block in this layer: that is `input_channels` for the small block and 4 times of it for the large block. It does not depend on the block index, so compute it once
        later_block_preceding_output_channels = (
            input_channels
            if building_block_type == HATMaskResNetBlockSmall
            else input_channels * 4
        )

        layer = []

        for block_idx in range(building_block_num):
            is_first_block = block_idx == 0
            block = building_block_type(
                outer_layer_name=layer_name,
                block_idx=block_idx,
                preceding_output_channels=(
                    preceding_output_channels
                    if is_first_block
                    else later_block_preceding_output_channels
                ),  # the 1st block receives the output of the preceding multi-building-block layer instead
                input_channels=input_channels,
                overall_stride=(
                    overall_stride if is_first_block else 1
                ),  # only perform the overall stride at the 1st block in this multi-building-block layer
                gate=self.gate,
                # no batch normalisation in HAT masked blocks
                activation_layer=activation_layer,
                bias=bias,
            )
            layer.append(block)

            # collect the weighted layer names in the block and sync to the weighted layer names list in the outer network
            self.weighted_layer_names += block.weighted_layer_names

        return nn.Sequential(*layer)

    def update_multiple_blocks_task_embedding(self) -> None:
        r"""Collect the task embeddings in the multiple building blocks (2-5 convolutional layers) and sync them to the task embedding dict `task_embedding_t` of the outer network.

        This should only be called explicitly after the `__init__()` method, just because task embedding as `nn.Module` instance was wiped out at the beginning of it.
        """
        for multi_block_layer in (self.conv2x, self.conv3x, self.conv4x, self.conv5x):
            for block in multi_block_layer:
                self.task_embedding_t.update(block.task_embedding_t)

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
        """
        batch_size = input.size(0)
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        x = self.conv1(input)
        # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the feature maps
        x = x * mask["conv1"].view(1, -1, 1, 1)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        # go through the 2-5 convolutional layers block by block. Each block applies its own masks, so the mask it returns is discarded here in favour of the network-level `mask` above
        for multi_block_layer in (self.conv2x, self.conv3x, self.conv4x, self.conv5x):
            for block in multi_block_layer:
                x, _, hidden_features_block = block(
                    x,
                    stage=stage,
                    s_max=s_max,
                    batch_idx=batch_idx,
                    num_batches=num_batches,
                    test_mask=test_mask,
                )
                hidden_features.update(
                    hidden_features_block
                )  # store the hidden feature

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through heads

        return output_feature, mask, hidden_features
1295
1296
class HATMaskResNet18(HATMaskResNetBase):
    r"""HAT masked ResNet-18 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-18 is the smallest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html), named after its 18 weighted layers. See Table 1 in the paper for details. It is built from the small building block, with 2 blocks in each of the 2-5 convolutional layers.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of the network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # the fixed architecture settings which make this network a ResNet-18
        architecture = {
            "building_block_type": HATMaskResNetBlockSmall,  # ResNet-18 uses the smaller building block
            "building_block_nums": (2, 2, 2, 2),
            "building_block_preceding_output_channels": (64, 64, 128, 256),
            "building_block_input_channels": (64, 128, 256, 512),
        }
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
            **architecture,
        )
1337
1338
class HATMaskResNet34(HATMaskResNetBase):
    r"""HAT masked ResNet-34 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-34 is one of the smaller architectures proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html), named after its 34 weighted layers. See Table 1 in the paper for details. It is built from the small building block, with 3, 4, 6 and 3 blocks in the 2-5 convolutional layers correspondingly.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of the network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # the fixed architecture settings which make this network a ResNet-34
        architecture = {
            "building_block_type": HATMaskResNetBlockSmall,  # ResNet-34 uses the smaller building block
            "building_block_nums": (3, 4, 6, 3),
            "building_block_preceding_output_channels": (64, 64, 128, 256),
            "building_block_input_channels": (64, 128, 256, 512),
        }
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
            **architecture,
        )
1379
1380
class HATMaskResNet50(HATMaskResNetBase):
    r"""HAT masked ResNet-50 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-50 is one of the larger architectures proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html), named after its 50 weighted layers. See Table 1 in the paper for details. It is built from the large building block, with 3, 4, 6 and 3 blocks in the 2-5 convolutional layers correspondingly.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of the network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # the fixed architecture settings which make this network a ResNet-50
        architecture = {
            "building_block_type": HATMaskResNetBlockLarge,  # ResNet-50 uses the larger building block
            "building_block_nums": (3, 4, 6, 3),
            "building_block_preceding_output_channels": (64, 256, 512, 1024),
            "building_block_input_channels": (64, 128, 256, 512),
        }
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
            **architecture,
        )
1421
1422
class HATMaskResNet101(HATMaskResNetBase):
    r"""HAT masked ResNet-101 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-101 is one of the larger architectures proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html), named after its 101 weighted layers. See Table 1 in the paper for details. It is built from the large building block, with 3, 4, 23 and 3 blocks in the 2-5 convolutional layers correspondingly.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of the network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # the fixed architecture settings which make this network a ResNet-101
        architecture = {
            "building_block_type": HATMaskResNetBlockLarge,  # ResNet-101 uses the larger building block
            "building_block_nums": (3, 4, 23, 3),
            "building_block_preceding_output_channels": (64, 256, 512, 1024),
            "building_block_input_channels": (64, 128, 256, 512),
        }
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
            **architecture,
        )
1463
1464
class HATMaskResNet152(HATMaskResNetBase):
    r"""HAT masked ResNet-152 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-152 is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html), named after its 152 weighted layers. See Table 1 in the paper for details. It is built from the large building block, with 3, 8, 36 and 3 blocks in the 2-5 convolutional layers correspondingly.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of input of the network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # the fixed architecture settings which make this network a ResNet-152
        architecture = {
            "building_block_type": HATMaskResNetBlockLarge,  # ResNet-152 uses the larger building block
            "building_block_nums": (3, 8, 36, 3),
            "building_block_preceding_output_channels": (64, 256, 512, 1024),
            "building_block_input_channels": (64, 128, 256, 512),
        }
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
            **architecture,
        )
class ResNetBlockSmall(clarena.backbones.base.CLBackbone):
 31class ResNetBlockSmall(CLBackbone):
 32    r"""The smaller building block for ResNet-18/34.
 33
 34    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 35    """
 36
 37    def __init__(
 38        self,
 39        outer_layer_name: str,
 40        block_idx: int,
 41        preceding_output_channels: int,
 42        input_channels: int,
 43        overall_stride: int,
 44        activation_layer: nn.Module | None = nn.ReLU,
 45        batch_normalisation: bool = True,
 46        bias: bool = False,
 47    ) -> None:
 48        r"""Construct and initialise the smaller building block.
 49
 50        **Args:**
 51        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 52        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 53        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 54        - **input_channels** (`int`): the number of channels of input of this building block.
 55        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
 56        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 57        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 58        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 59        """
 60        CLBackbone.__init__(self, output_dim=None)
 61
 62        self.batch_normalisation: bool = batch_normalisation
 63        r"""Store whether to use batch normalisation after the fully-connected layers."""
 64        self.activation: bool = activation_layer is not None
 65        r"""Store whether to use activation function after the fully-connected layers."""
 66
 67        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
 68        r"""Format and store full name of the 1st weighted convolutional layer. """
 69        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
 70        r"""Format and store full name of the 2nd weighted convolutional layer. """
 71
 72        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 73        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
 74        layer_output_channels = (
 75            input_channels  # the output channels of the 1st convolutional layer
 76        )
 77        self.conv1 = nn.Conv2d(
 78            in_channels=layer_input_channels,
 79            out_channels=layer_output_channels,
 80            kernel_size=3,
 81            stride=1,
 82            padding=1,
 83            bias=bias,
 84        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
 85        r"""The 1st weight convolutional layer of the smaller building block. """
 86        self.weighted_layer_names.append(
 87            self.full_1st_layer_name
 88        )  # update the weighted layer names
 89        if self.batch_normalisation:
 90            self.conv_bn1 = nn.BatchNorm2d(
 91                num_features=layer_output_channels
 92            )  # construct the batch normalisation layer
 93            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 94        if self.activation:
 95            self.conv_activation1 = activation_layer()  # construct the activation layer
 96            r"""The activation layer after the 1st weighted convolutional layer. """
 97
 98        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 99        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
100        layer_output_channels = (
101            input_channels * 1
102        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
103        self.conv2 = nn.Conv2d(
104            in_channels=layer_input_channels,
105            out_channels=layer_output_channels,
106            kernel_size=3,
107            stride=overall_stride,
108            padding=1,
109            bias=bias,
110        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
111        r"""The 2nd weight convolutional layer of the smaller building block. """
112        self.weighted_layer_names.append(
113            self.full_2nd_layer_name
114        )  # update the weighted layer names
115        if batch_normalisation:
116            self.conv_bn2 = nn.BatchNorm2d(
117                num_features=layer_output_channels
118            )  # construct the batch normalisation layer
119            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
120        if self.activation:
121            self.conv_activation2 = activation_layer()  # construct the activation layer
122            r"""The activation layer after the 2nd weighted convolutional layer. """
123
124        self.identity_downsample: nn.Module = (
125            nn.Conv2d(
126                in_channels=preceding_output_channels,
127                out_channels=input_channels,
128                kernel_size=1,
129                stride=overall_stride,
130                bias=False,
131            )
132            if preceding_output_channels != input_channels or overall_stride != 1
133            else None
134        )  # construct the identity downsample function
135        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
136
137    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
138        r"""The forward pass for data.
139
140        **Args:**
141        - **input** (`Tensor`): the input feature maps.
142
143        **Returns:**
144        - **output_feature** (`Tensor`): the output feature maps.
145        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
146        """
147        hidden_features = {}
148
149        identity = (
150            self.identity_downsample(input)
151            if self.identity_downsample is not None
152            else input
153        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
154
155        x = input
156        x = self.conv1(x)
157        if self.batch_normalisation:
158            x = self.conv_bn1(x)
159        if self.activation:
160            x = self.conv_activation1(x)
161        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
162
163        x = self.conv2(x)
164        if self.batch_normalisation:
165            x = self.conv_bn2(x)
166
167        x = x + identity
168        if self.activation:
169            x = self.conv_activation2(x)  # activation after the shortcut connection
170        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
171
172        output_feature = x
173
174        return output_feature, hidden_features

The smaller building block for ResNet-18/34.

It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the original ResNet paper.

ResNetBlockSmall( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
 37    def __init__(
 38        self,
 39        outer_layer_name: str,
 40        block_idx: int,
 41        preceding_output_channels: int,
 42        input_channels: int,
 43        overall_stride: int,
 44        activation_layer: nn.Module | None = nn.ReLU,
 45        batch_normalisation: bool = True,
 46        bias: bool = False,
 47    ) -> None:
 48        r"""Construct and initialise the smaller building block.
 49
 50        **Args:**
 51        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 52        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 53        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 54        - **input_channels** (`int`): the number of channels of input of this building block.
 55        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
 56        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 57        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
 58        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
 59        """
 60        CLBackbone.__init__(self, output_dim=None)
 61
 62        self.batch_normalisation: bool = batch_normalisation
 63        r"""Store whether to use batch normalisation after the fully-connected layers."""
 64        self.activation: bool = activation_layer is not None
 65        r"""Store whether to use activation function after the fully-connected layers."""
 66
 67        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
 68        r"""Format and store full name of the 1st weighted convolutional layer. """
 69        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
 70        r"""Format and store full name of the 2nd weighted convolutional layer. """
 71
 72        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
 73        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
 74        layer_output_channels = (
 75            input_channels  # the output channels of the 1st convolutional layer
 76        )
 77        self.conv1 = nn.Conv2d(
 78            in_channels=layer_input_channels,
 79            out_channels=layer_output_channels,
 80            kernel_size=3,
 81            stride=1,
 82            padding=1,
 83            bias=bias,
 84        )  # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here
 85        r"""The 1st weight convolutional layer of the smaller building block. """
 86        self.weighted_layer_names.append(
 87            self.full_1st_layer_name
 88        )  # update the weighted layer names
 89        if self.batch_normalisation:
 90            self.conv_bn1 = nn.BatchNorm2d(
 91                num_features=layer_output_channels
 92            )  # construct the batch normalisation layer
 93            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
 94        if self.activation:
 95            self.conv_activation1 = activation_layer()  # construct the activation layer
 96            r"""The activation layer after the 1st weighted convolutional layer. """
 97
 98        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
 99        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
100        layer_output_channels = (
101            input_channels * 1
102        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
103        self.conv2 = nn.Conv2d(
104            in_channels=layer_input_channels,
105            out_channels=layer_output_channels,
106            kernel_size=3,
107            stride=overall_stride,
108            padding=1,
109            bias=bias,
110        )  # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here
111        r"""The 2nd weight convolutional layer of the smaller building block. """
112        self.weighted_layer_names.append(
113            self.full_2nd_layer_name
114        )  # update the weighted layer names
115        if batch_normalisation:
116            self.conv_bn2 = nn.BatchNorm2d(
117                num_features=layer_output_channels
118            )  # construct the batch normalisation layer
119            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
120        if self.activation:
121            self.conv_activation2 = activation_layer()  # construct the activation layer
122            r"""The activation layer after the 2nd weighted convolutional layer. """
123
124        self.identity_downsample: nn.Module = (
125            nn.Conv2d(
126                in_channels=preceding_output_channels,
127                out_channels=input_channels,
128                kernel_size=1,
129                stride=overall_stride,
130                bias=False,
131            )
132            if preceding_output_channels != input_channels or overall_stride != 1
133            else None
134        )  # construct the identity downsample function
135        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """

Construct and initialise the smaller building block.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. The stride is applied at the 2nd (last) convolutional layer, while the 1st convolutional layer keeps a stride of 1.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
batch_normalisation: bool

Store whether to use batch normalisation after the weighted convolutional layers.

activation: bool

Store whether to use activation function after the weighted convolutional layers.

full_1st_layer_name

Format and store full name of the 1st weighted convolutional layer.

full_2nd_layer_name

Format and store full name of the 2nd weighted convolutional layer.

conv1

The 1st weight convolutional layer of the smaller building block.

conv2

The 2nd weight convolutional layer of the smaller building block.

identity_downsample: torch.nn.modules.module.Module

The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal the number of preceding output channels or the overall stride is not 1.

def forward( self, input: torch.Tensor) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
137    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
138        r"""The forward pass for data.
139
140        **Args:**
141        - **input** (`Tensor`): the input feature maps.
142
143        **Returns:**
144        - **output_feature** (`Tensor`): the output feature maps.
145        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
146        """
147        hidden_features = {}
148
149        identity = (
150            self.identity_downsample(input)
151            if self.identity_downsample is not None
152            else input
153        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
154
155        x = input
156        x = self.conv1(x)
157        if self.batch_normalisation:
158            x = self.conv_bn1(x)
159        if self.activation:
160            x = self.conv_activation1(x)
161        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
162
163        x = self.conv2(x)
164        if self.batch_normalisation:
165            x = self.conv_bn2(x)
166
167        x = x + identity
168        if self.activation:
169            x = self.conv_activation2(x)  # activation after the shortcut connection
170        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
171
172        output_feature = x
173
174        return output_feature, hidden_features

The forward pass for data.

Args:

  • input (Tensor): the input feature maps.

Returns:

  • output_feature (Tensor): the output feature maps.
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNetBlockLarge(clarena.backbones.base.CLBackbone):
class ResNetBlockLarge(CLBackbone):
    r"""The larger building block for ResNet-50/101/152. It is referred to as the "bottleneck" building block in the paper.

    It consists of 3 weighted convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the larger building block.

        **Args:**
        - **outer_layer_name** (`str`): the name of the multi-building-block layer that contains this building block. It is used to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of this building block in the multi-building-block layer. It is used to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of input of this building block. The block outputs 4 times this number of channels.
        - **overall_stride** (`int`): the overall stride of this building block. The stride is applied at the 2nd (middle) convolutional layer, while the 1st and 3rd convolutional layers keep a stride of 1.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
        """
        CLBackbone.__init__(self, output_dim=None)

        self.batch_normalisation: bool = batch_normalisation
        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Store whether to use activation function after the weighted convolutional layers."""

        # all three names share the "<outer layer>/<block index>/<conv>" scheme, the same one used by the smaller building block
        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
        r"""Format and store full name of the 1st weighted convolutional layer. """
        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
        r"""Format and store full name of the 2nd weighted convolutional layer. """
        self.full_3rd_layer_name = f"{outer_layer_name}/{block_idx}/conv3"
        r"""Format and store full name of the 3rd weighted convolutional layer. """

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = preceding_output_channels  # receives the output channels of the preceding module
        layer_output_channels = input_channels
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,  # the overall stride is not performed here
            padding=0,
            bias=bias,
        )
        r"""The 1st weighted convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_1st_layer_name
        )  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn1 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()
            r"""The activation layer after the 1st weighted convolutional layer. """

        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # same as the output channels of the 1st convolutional layer
        layer_output_channels = input_channels  # no channel expansion in the 2nd convolutional layer
        self.conv2 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=3,
            stride=overall_stride,  # the overall stride is performed here
            padding=1,
            bias=bias,
        )
        r"""The 2nd weighted convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_2nd_layer_name
        )  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn2 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
        if self.activation:
            self.conv_activation2 = activation_layer()
            r"""The activation layer after the 2nd weighted convolutional layer. """

        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
        layer_input_channels = input_channels  # same as the output channels of the 2nd convolutional layer
        layer_output_channels = input_channels * 4  # the 3rd convolutional layer expands the channels 4 times
        self.conv3 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,  # the overall stride is not performed here
            padding=0,
            bias=bias,
        )
        r"""The 3rd weighted convolutional layer of the larger building block. """
        self.weighted_layer_names.append(
            self.full_3rd_layer_name
        )  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn3 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
        if self.activation:
            self.conv_activation3 = activation_layer()
            r"""The activation layer after the 3rd weighted convolutional layer. """

        # the shortcut needs a projection only when the identity's shape would not match the expanded block output
        self.identity_downsample: nn.Module | None = (
            nn.Conv2d(
                in_channels=preceding_output_channels,
                out_channels=input_channels * 4,
                kernel_size=1,
                stride=overall_stride,
                bias=False,
            )
            if preceding_output_channels != input_channels * 4 or overall_stride != 1
            else None
        )
        r"""The convolutional layer for downsampling identity in the shortcut connection, or `None` if the identity already matches the output. It is only constructed when the number of preceding output channels doesn't equal 4 times the number of input channels or the overall stride is not 1. """

    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data.

        **Args:**
        - **input** (`Tensor`): the input feature maps.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
        """
        hidden_features = {}

        # remember the identity of input for the shortcut connection, downsampled if its shape doesn't match the output's
        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )

        x = input
        x = self.conv1(x)
        if self.batch_normalisation:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)
        if self.batch_normalisation:
            x = self.conv_bn2(x)
        if self.activation:
            x = self.conv_activation2(x)
        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature

        x = self.conv3(x)
        if self.batch_normalisation:
            x = self.conv_bn3(x)

        x = x + identity
        if self.activation:
            x = self.conv_activation3(x)  # activation after the shortcut connection
        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, hidden_features

The larger building block for ResNet-50/101/152. It is referred to as the "bottleneck" building block in the paper.

It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the original ResNet paper.

ResNetBlockLarge( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
183    def __init__(
184        self,
185        outer_layer_name: str,
186        block_idx: int,
187        preceding_output_channels: int,
188        input_channels: int,
189        overall_stride: int,
190        activation_layer: nn.Module | None = nn.ReLU,
191        batch_normalisation: bool = True,
192        bias: bool = False,
193    ) -> None:
194        r"""Construct and initialise the larger building block.
195
196        **Args:**
197        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
198        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
199        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
200        - **input_channels** (`int`): the number of channels of input of this building block.
201        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
202        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
203        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
204        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
205        """
206        CLBackbone.__init__(self, output_dim=None)
207
208        self.batch_normalisation: bool = batch_normalisation
209        r"""Store whether to use batch normalisation after the fully-connected layers."""
210        self.activation: bool = activation_layer is not None
211        r"""Store whether to use activation function after the fully-connected layers."""
212
213        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
214        r"""Format and store full name of the 1st weighted convolutional layer. """
215        self.full_2nd_layer_name = f"{outer_layer_name}_{block_idx}_conv2"
216        r"""Format and store full name of the 2nd weighted convolutional layer. """
217        self.full_3rd_layer_name = f"{outer_layer_name}_{block_idx}_conv3"
218        r"""Format and store full name of the 3rd weighted convolutional layer. """
219
220        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
221        layer_input_channels = preceding_output_channels  # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module
222        layer_output_channels = (
223            input_channels  # the output channels of the 1st convolutional layer
224        )
225        self.conv1 = nn.Conv2d(
226            in_channels=layer_input_channels,
227            out_channels=layer_output_channels,
228            kernel_size=1,
229            stride=1,
230            padding=0,
231            bias=bias,
232        )  # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here
233        r"""The 1st weight convolutional layer of the larger building block. """
234        self.weighted_layer_names.append(
235            self.full_1st_layer_name
236        )  # update the weighted layer names
237        if self.batch_normalisation:
238            self.conv_bn1 = nn.BatchNorm2d(
239                num_features=layer_output_channels
240            )  # construct the batch normalisation layer
241            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
242        if self.activation:
243            self.conv_activation1 = activation_layer()  # construct the activation layer
244            r"""The activation layer after the 1st weighted convolutional layer. """
245
246        # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc)
247        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer
248        layer_output_channels = (
249            input_channels
250            * 1  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
251        )
252        self.conv2 = nn.Conv2d(
253            in_channels=layer_input_channels,
254            out_channels=layer_output_channels,
255            kernel_size=3,
256            stride=overall_stride,
257            padding=1,
258            bias=bias,
259        )  # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here
260        r"""The 2nd weight convolutional layer of the larger building block. """
261        self.weighted_layer_names.append(
262            self.full_2nd_layer_name
263        )  # update the weighted layer names
264        if self.batch_normalisation:
265            self.conv_bn2 = nn.BatchNorm2d(
266                num_features=layer_output_channels
267            )  # construct the batch normalisation layer
268            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """
269        if self.activation:
270            self.conv_activation2 = activation_layer()  # construct the activation layer
271            r"""The activation layer after the 2nd weighted convolutional layer. """
272
273        # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc)
274        layer_input_channels = (
275            input_channels * 1
276        )  # the input channels of the 2nd convolutional layer, which is `input_channels * 1`, the same as the output channels of the 1st convolutional layer
277        layer_output_channels = (
278            input_channels
279            * 4  # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels
280        )
281        self.conv3 = nn.Conv2d(
282            in_channels=layer_input_channels,
283            out_channels=layer_output_channels,
284            kernel_size=1,
285            stride=1,
286            padding=0,
287            bias=bias,
288        )  # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here
289        r"""The 3rd weight convolutional layer of the larger building block. """
290        self.weighted_layer_names.append(
291            self.full_3rd_layer_name
292        )  # update the weighted layer names
293        if batch_normalisation:
294            self.conv_bn3 = nn.BatchNorm2d(
295                num_features=layer_output_channels
296            )  # construct the batch normalisation layer
297            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """
298        if self.activation:
299            self.conv_activation3 = activation_layer()  # construct the activation layer
300            r"""The activation layer after the 3rd weighted convolutional layer. """
301
302        self.identity_downsample: nn.Module = (
303            nn.Conv2d(
304                in_channels=preceding_output_channels,
305                out_channels=input_channels * 4,
306                kernel_size=1,
307                stride=overall_stride,
308                bias=False,
309            )
310            if preceding_output_channels != input_channels * 4 or overall_stride != 1
311            else None
312        )
313        r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """

Construct and initialise the larger building block.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layer. Default False, because batch normalisation are doing the similar thing with bias.
batch_normalisation: bool

Store whether to use batch normalisation after the weighted convolutional layers.

activation: bool

Store whether to use activation function after the weighted convolutional layers.

full_1st_layer_name

Format and store full name of the 1st weighted convolutional layer.

full_2nd_layer_name

Format and store full name of the 2nd weighted convolutional layer.

full_3rd_layer_name

Format and store full name of the 3rd weighted convolutional layer.

conv1

The 1st weight convolutional layer of the larger building block.

conv2

The 2nd weight convolutional layer of the larger building block.

conv3

The 3rd weight convolutional layer of the larger building block.

identity_downsample: torch.nn.modules.module.Module

The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of preceding output channels doesn't equal the number of output channels of this block (input_channels * 4) or the overall stride of the block is not 1.

def forward( self, input: torch.Tensor) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
315    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
316        r"""The forward pass for data.
317
318        **Args:**
319        - **input** (`Tensor`): the input feature maps.
320
321        **Returns:**
322        - **output_feature** (`Tensor`): the output feature maps.
323        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
324        """
325        hidden_features = {}
326
327        identity = (
328            self.identity_downsample(input)
329            if self.identity_downsample is not None
330            else input
331        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
332
333        x = input
334        x = self.conv1(x)
335        if self.batch_normalisation:
336            x = self.conv_bn1(x)
337        if self.activation:
338            x = self.conv_activation1(x)
339        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
340
341        x = self.conv2(x)
342        if self.batch_normalisation:
343            x = self.conv_bn2(x)
344        if self.activation:
345            x = self.conv_activation2(x)
346        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
347
348        x = self.conv3(x)
349        if self.batch_normalisation:
350            x = self.conv_bn3(x)
351
352        x = x + identity
353        if self.activation:
354            x = self.conv_activation3(x)  # activation after the shortcut connection
355        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature
356
357        output_feature = x
358
359        return output_feature, hidden_features

The forward pass for data.

Args:

  • input (Tensor): the input feature maps.

Returns:

  • output_feature (Tensor): the output feature maps.
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNetBase(clarena.backbones.base.CLBackbone):
class ResNetBase(CLBackbone):
    r"""The base class of [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    ResNet is a convolutional network architecture that starts with a 1st weighted convolutional layer and a max pooling layer, followed by 4 multi-block convolutional layers (`conv2x` to `conv5x`). Each of these 4 layers is constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several weighted convolutional layers and a skip connection, which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there is an average pooling layer, and the pooled feature maps are flattened into the output feature.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: type[ResNetBlockSmall] | type[ResNetBlockLarge],
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: type[nn.Module] | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data keep their channels when going into ResNet. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **building_block_type** (`type[ResNetBlockSmall]` | `type[ResNetBlockLarge]`): the class of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2nd-5th convolutional layers respectively.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of the preceding output of the 2nd-5th convolutional layers respectively.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of the building blocks in the 2nd-5th convolutional layers respectively.
        - **output_dim** (`int`): the output dimension after the final flattening. Although it is determined by the architecture built before the flattening rather than by us, it still has to be provided here and is passed to `CLBackbone.__init__`.
        - **activation_layer** (`type[nn.Module]` | `None`): the class of the activation function of each layer. It is instantiated once per layer. If `None`, no activation is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already provides a similar shift.
        """
        CLBackbone.__init__(self, output_dim=output_dim)

        self.batch_normalisation: bool = batch_normalisation
        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Store whether to use activation function after the weighted convolutional layers."""

        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
        stem_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=input_channels,  # receives the input of the entire network
            out_channels=stem_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )
        r"""The 1st weight convolutional layer of the entire ResNet. It is always with fixed kernel size 7x7, stride 2, and padding 3. """
        self.weighted_layer_names.append("conv1")  # collect the weighted layer name
        if self.batch_normalisation:
            self.conv_bn1 = nn.BatchNorm2d(num_features=stem_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
        if self.activation:
            self.conv_activation1 = activation_layer()
            r"""The activation layer after the 1st weighted convolutional layer. """

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        r"""The max pooling layer which is laid in between 1st and 2nd convolutional layers with kernel size 3x3, stride 2, and padding 1. """

        # construct the 2nd convolutional layer with multiple blocks
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer is 1, because the preceding max pooling layer already has stride 2
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 3rd convolutional layer with multiple blocks
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer is 2, halving the spatial size
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 4th convolutional layer with multiple blocks
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer is 2, halving the spatial size
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """

        # construct the 5th convolutional layer with multiple blocks
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer is 2, halving the spatial size
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: type[ResNetBlockSmall] | type[ResNetBlockLarge],
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: type[nn.Module] | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2nd-5th convolutional layers of the ResNet.

        Besides building the blocks, this method also appends the weighted layer names of every block to `self.weighted_layer_names`.

        **Args:**
        - **layer_name** (`str`): the name of this multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`type[ResNetBlockSmall]` | `type[ResNetBlockLarge]`): the class of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this entire multi-building-block layer. Only the 1st block receives it.
        - **input_channels** (`int`): the number of channels of input of each building block in this layer.
        - **overall_stride** (`int`): the overall stride of this layer. It is performed at the 1st building block; all other building blocks use an overall stride of 1.
        - **activation_layer** (`type[nn.Module]` | `None`): the class of the activation function of each layer. If `None`, no activation is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """
        # the number of channels handed from one block to the next inside this layer: `ResNetBlockSmall` keeps `input_channels`, any other block type is treated as expanding the channels by a factor of 4
        inner_preceding_channels = (
            input_channels
            if building_block_type == ResNetBlockSmall
            else input_channels * 4
        )

        blocks = []
        for block_idx in range(building_block_num):
            is_first_block = block_idx == 0
            block = building_block_type(
                outer_layer_name=layer_name,
                block_idx=block_idx,
                preceding_output_channels=(
                    preceding_output_channels
                    if is_first_block
                    else inner_preceding_channels
                ),  # only the 1st block receives the output of the preceding layer
                input_channels=input_channels,
                overall_stride=(
                    overall_stride if is_first_block else 1
                ),  # only perform the overall stride at the 1st block
                activation_layer=activation_layer,
                batch_normalisation=batch_normalisation,
                bias=bias,
            )
            blocks.append(block)

            # sync the weighted layer names of the block to the list of the outer network
            self.weighted_layer_names += block.weighted_layer_names

        return nn.Sequential(*blocks)

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): not used by this backbone.
        - **task_id** (`int` | `None`): not used by this backbone.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor, flattened to one row per sample in the batch.
        - **hidden_features** (`dict[str, Tensor]`): the hidden features in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor.
        """
        batch_size = input.size(0)
        hidden_features = {}

        x = self.conv1(input)
        if self.batch_normalisation:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        # iterate the blocks one by one, because each block returns its own hidden features alongside its output
        for multi_block_layer in (self.conv2x, self.conv3x, self.conv4x, self.conv5x):
            for block in multi_block_layer:
                x, hidden_features_block = block(x)
                hidden_features.update(hidden_features_block)

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten to one row per sample

        return output_feature, hidden_features

The base class of residual network (ResNet).

ResNet is a convolutional network architecture that starts with a 1st weighted convolutional layer and a max pooling layer, followed by 4 multi-block convolutional layers, each of which contains multiple weighted convolutional layers. Each of these 4 layers is constructed from basic building blocks which are either small (ResNetBlockSmall) or large (ResNetBlockLarge). Each building block contains several weighted convolutional layers. Each building block has a skip connection, which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there is an average pooling layer, and the pooled feature maps are flattened into the output feature that connects to the CL output heads.

ResNetBase( input_channels: int, building_block_type: ResNetBlockSmall | ResNetBlockLarge, building_block_nums: tuple[int, int, int, int], building_block_preceding_output_channels: tuple[int, int, int, int], building_block_input_channels: tuple[int, int, int, int], output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
368    def __init__(
369        self,
370        input_channels: int,
371        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
372        building_block_nums: tuple[int, int, int, int],
373        building_block_preceding_output_channels: tuple[int, int, int, int],
374        building_block_input_channels: tuple[int, int, int, int],
375        output_dim: int,
376        activation_layer: nn.Module | None = nn.ReLU,
377        batch_normalisation: bool = True,
378        bias: bool = False,
379    ) -> None:
380        r"""Construct and initialise the ResNet backbone network.
381
382        **Args:**
383        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
384        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
385        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
386        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
387        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
388        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
389        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
390        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
391        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
392        """
393        CLBackbone.__init__(self, output_dim=output_dim)
394
395        self.batch_normalisation: bool = batch_normalisation
396        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
397        self.activation: bool = activation_layer is not None
398        r"""Store whether to use an activation function after the weighted convolutional layers."""
399
400        # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc)
401        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receive the input of the entire network
402        layer_output_channels = 64  # the output channels of the 1st convolutional layer
403        self.conv1 = nn.Conv2d(
404            in_channels=layer_input_channels,
405            out_channels=layer_output_channels,
406            kernel_size=7,
407            stride=2,
408            padding=3,
409            bias=bias,
410        )  # construct the 1st weight convolutional layer of the entire ResNet
411        r"""The 1st weighted convolutional layer of the entire ResNet. It always has a fixed kernel size of 7x7, stride 2, and padding 3. """
412        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
413        if self.batch_normalisation:
414            self.conv_bn1 = nn.BatchNorm2d(
415                num_features=layer_output_channels
416            )  # construct the batch normalisation layer
417            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """
418        if self.activation:
419            self.conv_activation1 = activation_layer()  # construct the activation layer
420            r"""The activation layer after the 1st weighted convolutional layer. """
421
422        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  #
423        r"""The max pooling layer placed between the 1st and 2nd convolutional layers, with kernel size 3x3, stride 2, and padding 1. """
424
425        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
426        self.conv2x = self._multiple_blocks(
427            layer_name="conv2x",
428            building_block_type=building_block_type,
429            building_block_num=building_block_nums[0],
430            preceding_output_channels=building_block_preceding_output_channels[0],
431            input_channels=building_block_input_channels[0],
432            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
433            activation_layer=activation_layer,
434            batch_normalisation=batch_normalisation,
435            bias=bias,
436        )
437        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks. """
438
439        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
440        self.conv3x = self._multiple_blocks(
441            layer_name="conv3x",
442            building_block_type=building_block_type,
443            building_block_num=building_block_nums[1],
444            preceding_output_channels=building_block_preceding_output_channels[1],
445            input_channels=building_block_input_channels[1],
446            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
447            activation_layer=activation_layer,
448            batch_normalisation=batch_normalisation,
449            bias=bias,
450        )
451        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks. """
452
453        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
454        self.conv4x = self._multiple_blocks(
455            layer_name="conv4x",
456            building_block_type=building_block_type,
457            building_block_num=building_block_nums[2],
458            preceding_output_channels=building_block_preceding_output_channels[2],
459            input_channels=building_block_input_channels[2],
460            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
461            activation_layer=activation_layer,
462            batch_normalisation=batch_normalisation,
463            bias=bias,
464        )
465        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks. """
466
467        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc)
468        self.conv5x = self._multiple_blocks(
469            layer_name="conv5x",
470            building_block_type=building_block_type,
471            building_block_num=building_block_nums[3],
472            preceding_output_channels=building_block_preceding_output_channels[3],
473            input_channels=building_block_input_channels[3],
473            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 2 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
475            activation_layer=activation_layer,
476            batch_normalisation=batch_normalisation,
477            bias=bias,
478        )
479        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks. """
480
481        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
482        r"""The average pooling layer which is laid after the convolutional layers and before feature maps are flattened. """

Construct and initialise the ResNet backbone network.

Args:

  • input_channels (int): the number of channels of the input. Image data keep their channel dimension when fed into the ResNet. Note that convolutional networks require the number of input channels instead of the input dimension.
  • building_block_type (ResNetBlockSmall | ResNetBlockLarge): the type of building block used in the ResNet.
  • building_block_nums (tuple[int, int, int, int]): the number of building blocks in the 2nd to 5th convolutional layers respectively.
  • building_block_preceding_output_channels (tuple[int, int, int, int]): the number of channels of the preceding output of each building block in the 2nd to 5th convolutional layers respectively.
  • building_block_input_channels (tuple[int, int, int, int]): the number of channels of the input of each building block in the 2nd to 5th convolutional layers respectively.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
batch_normalisation: bool

Store whether to use batch normalisation after the weighted convolutional layers.

activation: bool

Store whether to use an activation function after the weighted convolutional layers.

conv1

The 1st weighted convolutional layer of the entire ResNet. It always has a fixed kernel size of 7x7, stride 2, and padding 3.

maxpool

The max pooling layer placed between the 1st and 2nd convolutional layers, with kernel size 3x3, stride 2, and padding 1.

conv2x

The 2nd convolutional layer of the ResNet, which contains multiple blocks.

conv3x

The 3rd convolutional layer of the ResNet, which contains multiple blocks.

conv4x

The 4th convolutional layer of the ResNet, which contains multiple blocks.

conv5x

The 5th convolutional layer of the ResNet, which contains multiple blocks.

avepool

The average pooling layer which is laid after the convolutional layers and before feature maps are flattened.

def forward( self, input: torch.Tensor, stage: str = None, task_id: int | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor]]:
551    def forward(
552        self, input: Tensor, stage: str = None, task_id: int | None = None
553    ) -> tuple[Tensor, dict[str, Tensor]]:
554        r"""The forward pass for data. It is the same for all tasks: `stage` and `task_id` are accepted but not used here.
555
556        **Args:**
557        - **input** (`Tensor`): the input tensor from data.
558
559        **Returns:**
560        - **output_feature** (`Tensor`): the output feature tensor to be passed into heads. This is the main target of backpropagation.
561        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
562        """
563        batch_size = input.size(0)
564        hidden_features = {}
565
566        x = input
567
568        x = self.conv1(x)
569        if self.batch_normalisation:
570            x = self.conv_bn1(x)
571        if self.activation:
572            x = self.conv_activation1(x)
573        hidden_features["conv1"] = x
574
575        x = self.maxpool(x)
576
577        for block in self.conv2x:
578            x, hidden_features_block = block(x)
579            hidden_features.update(hidden_features_block)  # store the hidden feature
580        for block in self.conv3x:
581            x, hidden_features_block = block(x)
582            hidden_features.update(hidden_features_block)  # store the hidden feature
583        for block in self.conv4x:
584            x, hidden_features_block = block(x)
585            hidden_features.update(hidden_features_block)  # store the hidden feature
586        for block in self.conv5x:
587            x, hidden_features_block = block(x)
588            hidden_features.update(hidden_features_block)  # store the hidden feature
589
590        x = self.avepool(x)
591
592        output_feature = x.view(batch_size, -1)  # flatten before going through heads
593
594        return output_feature, hidden_features

The forward pass for data. It is the same for all tasks: stage and task_id are accepted but not used here.

Args:

  • input (Tensor): the input tensor from data.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed into heads. This is the main target of backpropagation.
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
class ResNet18(ResNetBase):
597class ResNet18(ResNetBase):
598    r"""ResNet-18 backbone network.
599
600    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
601    """
602
603    def __init__(
604        self,
605        input_channels: int,
606        output_dim: int,
607        activation_layer: nn.Module | None = nn.ReLU,
608        batch_normalisation: bool = True,
609        bias: bool = False,
610    ) -> None:
611        r"""Construct and initialise the ResNet-18 backbone network.
612
613        **Args:**
614        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
615        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
616        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
617        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
618        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
619        """
620        ResNetBase.__init__(
621            self,
622            input_channels=input_channels,
623            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
624            building_block_nums=(2, 2, 2, 2),
625            building_block_preceding_output_channels=(64, 64, 128, 256),
626            building_block_input_channels=(64, 128, 256, 512),
627            output_dim=output_dim,
628            activation_layer=activation_layer,
629            batch_normalisation=batch_normalisation,
630            bias=bias,
631        )

ResNet-18 backbone network.

This is a smaller architecture proposed in the original ResNet paper. It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet18( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
603    def __init__(
604        self,
605        input_channels: int,
606        output_dim: int,
607        activation_layer: nn.Module | None = nn.ReLU,
608        batch_normalisation: bool = True,
609        bias: bool = False,
610    ) -> None:
611        r"""Construct and initialise the ResNet-18 backbone network.
612
613        **Args:**
614        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
615        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
616        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
617        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
618        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
619        """
620        ResNetBase.__init__(
621            self,
622            input_channels=input_channels,
623            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
624            building_block_nums=(2, 2, 2, 2),
625            building_block_preceding_output_channels=(64, 64, 128, 256),
626            building_block_input_channels=(64, 128, 256, 512),
627            output_dim=output_dim,
628            activation_layer=activation_layer,
629            batch_normalisation=batch_normalisation,
630            bias=bias,
631        )

Construct and initialise the ResNet-18 backbone network.

Args:

  • input_channels (int): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
class ResNet34(ResNetBase):
634class ResNet34(ResNetBase):
635    r"""ResNet-34 backbone network.
636
637    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
638    """
639
640    def __init__(
641        self,
642        input_channels: int,
643        output_dim: int,
644        activation_layer: nn.Module | None = nn.ReLU,
645        batch_normalisation: bool = True,
646        bias: bool = False,
647    ) -> None:
648        r"""Construct and initialise the ResNet-34 backbone network.
649
650        **Args:**
651        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
652        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
653        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
654        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
655        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
656        """
657        ResNetBase.__init__(
658            self,
659            input_channels=input_channels,
660            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
661            building_block_nums=(3, 4, 6, 3),
662            building_block_preceding_output_channels=(64, 64, 128, 256),
663            building_block_input_channels=(64, 128, 256, 512),
664            output_dim=output_dim,
665            activation_layer=activation_layer,
666            batch_normalisation=batch_normalisation,
667            bias=bias,
668        )

ResNet-34 backbone network.

This is a smaller architecture proposed in the original ResNet paper. It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet34( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
640    def __init__(
641        self,
642        input_channels: int,
643        output_dim: int,
644        activation_layer: nn.Module | None = nn.ReLU,
645        batch_normalisation: bool = True,
646        bias: bool = False,
647    ) -> None:
648        r"""Construct and initialise the ResNet-34 backbone network.
649
650        **Args:**
651        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
652        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
653        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
654        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
655        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
656        """
657        ResNetBase.__init__(
658            self,
659            input_channels=input_channels,
660            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
661            building_block_nums=(3, 4, 6, 3),
662            building_block_preceding_output_channels=(64, 64, 128, 256),
663            building_block_input_channels=(64, 128, 256, 512),
664            output_dim=output_dim,
665            activation_layer=activation_layer,
666            batch_normalisation=batch_normalisation,
667            bias=bias,
668        )

Construct and initialise the ResNet-34 backbone network.

Args:

  • input_channels (int): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
class ResNet50(ResNetBase):
671class ResNet50(ResNetBase):
672    r"""ResNet-50 backbone network.
673
674    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
675    """
676
677    def __init__(
678        self,
679        input_channels: int,
680        output_dim: int,
681        activation_layer: nn.Module | None = nn.ReLU,
682        batch_normalisation: bool = True,
683        bias: bool = False,
684    ) -> None:
685        r"""Construct and initialise the ResNet-50 backbone network.
686
687        **Args:**
688        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
689        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
690        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
691        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
692        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
693        """
694        ResNetBase.__init__(
695            self,
696            input_channels=input_channels,
697            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
698            building_block_nums=(3, 4, 6, 3),
699            building_block_preceding_output_channels=(64, 256, 512, 1024),
700            building_block_input_channels=(64, 128, 256, 512),
701            output_dim=output_dim,
702            activation_layer=activation_layer,
703            batch_normalisation=batch_normalisation,
704            bias=bias,
705        )

ResNet-50 backbone network.

This is a larger architecture proposed in the original ResNet paper. It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet50( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
677    def __init__(
678        self,
679        input_channels: int,
680        output_dim: int,
681        activation_layer: nn.Module | None = nn.ReLU,
682        batch_normalisation: bool = True,
683        bias: bool = False,
684    ) -> None:
685        r"""Construct and initialise the ResNet-50 backbone network.
686
687        **Args:**
688        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
689        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
690        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
691        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
692        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
693        """
694        ResNetBase.__init__(
695            self,
696            input_channels=input_channels,
697            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
698            building_block_nums=(3, 4, 6, 3),
699            building_block_preceding_output_channels=(64, 256, 512, 1024),
700            building_block_input_channels=(64, 128, 256, 512),
701            output_dim=output_dim,
702            activation_layer=activation_layer,
703            batch_normalisation=batch_normalisation,
704            bias=bias,
705        )

Construct and initialise the ResNet-50 backbone network.

Args:

  • input_channels (int): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
class ResNet101(ResNetBase):
708class ResNet101(ResNetBase):
709    r"""ResNet-101 backbone network.
710
711    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
712    """
713
714    def __init__(
715        self,
716        input_channels: int,
717        output_dim: int,
718        activation_layer: nn.Module | None = nn.ReLU,
719        batch_normalisation: bool = True,
720        bias: bool = False,
721    ) -> None:
722        r"""Construct and initialise the ResNet-101 backbone network.
723
724        **Args:**
725        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
726        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
727        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
728        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
729        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
730        """
731        ResNetBase.__init__(
732            self,
733            input_channels=input_channels,
734            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
735            building_block_nums=(3, 4, 23, 3),
736            building_block_preceding_output_channels=(64, 256, 512, 1024),
737            building_block_input_channels=(64, 128, 256, 512),
738            output_dim=output_dim,
739            activation_layer=activation_layer,
740            batch_normalisation=batch_normalisation,
741            bias=bias,
742        )

ResNet-101 backbone network.

This is a larger architecture proposed in the original ResNet paper. It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet101( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
714    def __init__(
715        self,
716        input_channels: int,
717        output_dim: int,
718        activation_layer: nn.Module | None = nn.ReLU,
719        batch_normalisation: bool = True,
720        bias: bool = False,
721    ) -> None:
722        r"""Construct and initialise the ResNet-101 backbone network.
723
724        **Args:**
725        - **input_channels** (`int`): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
726        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
727        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
728        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
729        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation already plays a similar role to the bias.
730        """
731        ResNetBase.__init__(
732            self,
733            input_channels=input_channels,
734            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
735            building_block_nums=(3, 4, 23, 3),
736            building_block_preceding_output_channels=(64, 256, 512, 1024),
737            building_block_input_channels=(64, 128, 256, 512),
738            output_dim=output_dim,
739            activation_layer=activation_layer,
740            batch_normalisation=batch_normalisation,
741            bias=bias,
742        )

Construct and initialise the ResNet-101 backbone network.

Args:

  • input_channels (int): the number of channels of the input to the network. Note that convolutional networks require the number of input channels instead of the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation already plays a similar role to the bias.
class ResNet152(ResNetBase):
class ResNet152(ResNetBase):
    r"""ResNet-152 backbone network.

    This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-152 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input to this network. Note that convolutional networks require number of input channels instead of dimension.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation does a similar job to the bias.
        """
        # everything is delegated to `ResNetBase`; only the block type and the per-layer tuples are specific to ResNet-152
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
            building_block_nums=(3, 8, 36, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )

ResNet-152 backbone network.

This is the largest architecture proposed in the original ResNet paper. It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.

ResNet152( input_channels: int, output_dim: int, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, batch_normalisation: bool = True, bias: bool = False)
751    def __init__(
752        self,
753        input_channels: int,
754        output_dim: int,
755        activation_layer: nn.Module | None = nn.ReLU,
756        batch_normalisation: bool = True,
757        bias: bool = False,
758    ) -> None:
759        r"""Construct and initialise the ResNet-50 backbone network.
760
761        **Args:**
762        - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
763        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
764        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
765        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
766        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias.
767        """
768        ResNetBase.__init__(
769            self,
770            input_channels=input_channels,
771            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
772            building_block_nums=(3, 8, 36, 3),
773            building_block_preceding_output_channels=(64, 256, 512, 1024),
774            building_block_input_channels=(64, 128, 256, 512),
775            output_dim=output_dim,
776            activation_layer=activation_layer,
777            batch_normalisation=batch_normalisation,
778            bias=bias,
779        )

Construct and initialise the ResNet-152 backbone network.

Args:

  • input_channels (int): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • batch_normalisation (bool): whether to use batch normalisation after the weight convolutional layers. Default True, same as what the original ResNet paper does.
  • bias (bool): whether to use bias in the convolutional layers. Default False, because batch normalisation does a similar job to the bias.
class HATMaskResNetBlockSmall(clarena.backbones.base.HATMaskBackbone, ResNetBlockSmall):
class HATMaskResNetBlockSmall(HATMaskBackbone, ResNetBlockSmall):
    r"""The smaller (2-layer) building block used by HAT masked ResNet-18/34.

    The block holds 2 weighted convolutional layers. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    Every weighted convolutional layer owns a channel-wise task embedding. The mask generated from that embedding and the gate function is multiplied onto the output channels (the units) of the layer.
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Build the smaller building block and attach one task embedding to each of its weighted convolutional layers.

        **Args:**
        - **outer_layer_name** (`str`): name of the multi-building-block layer containing this block; used to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): index of this block inside the multi-building-block layer; used to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of input of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at the 2nd (last) convolutional layer while the 1st convolutional layer keeps a stride of 1.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # both parents are initialised explicitly: the HAT mask part first, then the plain ResNet block
        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
        # NOTE: `batch_normalisation` is not forwarded here, so `ResNetBlockSmall` uses its own default for it
        ResNetBlockSmall.__init__(
            self,
            outer_layer_name=outer_layer_name,
            block_idx=block_idx,
            preceding_output_channels=preceding_output_channels,
            input_channels=input_channels,
            overall_stride=overall_stride,
            activation_layer=activation_layer,
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(gate=gate)

        # neither convolutional layer of this block expands the channels, so both the 1st and the 2nd layer
        # get a channel-wise task embedding of size `input_channels` (created in layer order)
        for layer_name in (self.full_1st_layer_name, self.full_2nd_layer_name):
            self.task_embedding_t[layer_name] = nn.Embedding(
                num_embeddings=1, embedding_dim=input_channels
            )

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass of this block. The mask obtained for the current stage is applied to the units, which are the output channels of each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) of each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. HAT itself does not need them; they are returned to keep the API consistent for HAT-based algorithms that do.
        """
        # fetch the per-layer masks for this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        # shortcut branch: downsample the input only when a downsampling module exists
        if self.identity_downsample is None:
            shortcut = input
        else:
            shortcut = self.identity_downsample(input)

        first_name = self.full_1st_layer_name
        second_name = self.full_2nd_layer_name

        # 1st layer: convolution, then the channel mask (viewed as (1, C, 1, 1) so it broadcasts), then activation
        out = self.conv1(input) * mask[first_name].view(1, -1, 1, 1)
        if self.activation:
            out = self.conv_activation1(out)
        hidden_features = {first_name: out}

        # 2nd layer: convolution, add the shortcut, mask the sum, then activation
        out = (self.conv2(out) + shortcut) * mask[second_name].view(1, -1, 1, 1)
        if self.activation:
            out = self.conv_activation2(out)
        hidden_features[second_name] = out

        return out, mask, hidden_features

The smaller building block for HAT masked ResNet-18/34.

It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the original ResNet paper.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNetBlockSmall( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
790    def __init__(
791        self,
792        outer_layer_name: str,
793        block_idx: int,
794        preceding_output_channels: int,
795        input_channels: int,
796        overall_stride: int,
797        gate: str,
798        activation_layer: nn.Module | None = nn.ReLU,
799        bias: bool = False,
800    ) -> None:
801        r"""Construct and initialise the smaller building block with task embedding.
802
803        **Args:**
804        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
805        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
806        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
807        - **input_channels** (`int`): the number of channels of input of this building block.
808        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
809        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
810            - `sigmoid`: the sigmoid function.
811        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
812        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
813        """
814        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
815        ResNetBlockSmall.__init__(
816            self,
817            outer_layer_name=outer_layer_name,
818            block_idx=block_idx,
819            preceding_output_channels=preceding_output_channels,
820            input_channels=input_channels,
821            overall_stride=overall_stride,
822            activation_layer=activation_layer,
823            bias=bias,
824        )
825        self.register_hat_mask_module_explicitly(gate=gate)
826
827        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
828        layer_output_channels = (
829            input_channels  # the output channels of the 1st convolutional layer
830        )
831        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
832            num_embeddings=1, embedding_dim=layer_output_channels
833        )
834
835        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
836        layer_output_channels = (
837            input_channels * 1
838        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
839        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
840            num_embeddings=1, embedding_dim=layer_output_channels
841        )

Construct and initialise the smaller building block with task embedding.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
def forward( self, input: torch.Tensor, stage: str, s_max: float | None = None, batch_idx: int | None = None, num_batches: int | None = None, test_mask: dict[str, torch.Tensor] | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor], dict[str, torch.Tensor]]:
843    def forward(
844        self,
845        input: Tensor,
846        stage: str,
847        s_max: float | None = None,
848        batch_idx: int | None = None,
849        num_batches: int | None = None,
850        test_mask: dict[str, Tensor] | None = None,
851    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
852        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.
853
854        **Args:**
855        - **input** (`Tensor`): The input tensor from data.
856        - **stage** (`str`): the stage of the forward pass, should be one of the following:
857            1. 'train': training stage.
858            2. 'validation': validation stage.
859            3. 'test': testing stage.
860        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
861        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
862        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
863        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
864
865        **Returns:**
866        - **output_feature** (`Tensor`): the output feature maps.
867        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
868        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
869        """
870        hidden_features = {}
871
872        # get the mask for the current task from the task embedding in this stage
873        mask = self.get_mask(
874            stage=stage,
875            s_max=s_max,
876            batch_idx=batch_idx,
877            num_batches=num_batches,
878            test_mask=test_mask,
879        )
880
881        identity = (
882            self.identity_downsample(input)
883            if self.identity_downsample is not None
884            else input
885        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
886
887        x = input
888        x = self.conv1(x)  # weighted convolutional layer first
889        x = x * (
890            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
891        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
892        if self.activation:
893            x = self.conv_activation1(x)  # activation function third
894        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
895
896        x = self.conv2(x)  # weighted convolutional layer first
897        x = x + identity
898        x = x * (
899            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
900        )  # apply the mask to the 2nd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input
901        if self.activation:
902            x = self.conv_activation2(x)  # activation after the shortcut connection
903        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
904
905        output_feature = x
906
907        return output_feature, mask, hidden_features

The forward pass for data from the current task. The task-specific mask is applied to the units, which are the output channels of each weighted convolutional layer.

Args:

  • input (Tensor): The input tensor from data.
  • stage (str): the stage of the forward pass, should be one of the following:
    1. 'train': training stage.
    2. 'validation': validation stage.
    3. 'test': testing stage.
  • s_max (float | None): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in HAT paper.
  • batch_idx (int | None): the current batch index. Applies only to training stage. For other stages, it is default None.
  • num_batches (int | None): the total number of batches. Applies only to training stage. For other stages, it is default None.
  • test_mask (dict[str, Tensor] | None): the binary mask used for test. Applies only to testing stage. For other stages, it is default None.

Returns:

  • output_feature (Tensor): the output feature maps.
  • mask (dict[str, Tensor]): the mask for the current task. Key (str) is layer name, value (Tensor) is the mask tensor. The mask tensor has size (number of units).
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need them, they are still provided for API consistency with other HAT-based algorithms that inherit this forward() method.
class HATMaskResNetBlockLarge(clarena.backbones.base.HATMaskBackbone, ResNetBlockLarge):
 910class HATMaskResNetBlockLarge(HATMaskBackbone, ResNetBlockLarge):
 911    r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the ResNet paper.
 912
 913    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
 914
 915    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
 916    """
 917
 918    def __init__(
 919        self,
 920        outer_layer_name: str,
 921        block_idx: int,
 922        preceding_output_channels: int,
 923        input_channels: int,
 924        overall_stride: int,
 925        gate: str,
 926        activation_layer: nn.Module | None = nn.ReLU,
 927        bias: bool = False,
 928    ) -> None:
 929        r"""Construct and initialise the larger building block with task embedding.
 930
 931        **Args:**
 932        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
 933        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
 934        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
 935        - **input_channels** (`int`): the number of channels of input of this building block.
 936        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
 937        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
 938            - `sigmoid`: the sigmoid function.
 939        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
 940        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
 941        """
 942        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
 943        ResNetBlockLarge.__init__(
 944            self,
 945            outer_layer_name=outer_layer_name,
 946            block_idx=block_idx,
 947            preceding_output_channels=preceding_output_channels,
 948            input_channels=input_channels,
 949            overall_stride=overall_stride,
 950            activation_layer=activation_layer,
 951            bias=bias,
 952        )
 953        self.register_hat_mask_module_explicitly(gate=gate)
 954
 955        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
 956        layer_output_channels = (
 957            input_channels  # the output channels of the 1st convolutional layer
 958        )
 959        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
 960            num_embeddings=1, embedding_dim=layer_output_channels
 961        )
 962
 963        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
 964        layer_output_channels = (
 965            input_channels * 1
 966        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
 967        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
 968            num_embeddings=1, embedding_dim=layer_output_channels
 969        )
 970
 971        # construct the task embedding over the 3rd weighted convolutional layer. It is channel-wise
 972        layer_output_channels = (
 973            input_channels
 974            * 4  # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels
 975        )
 976        self.task_embedding_t[self.full_3rd_layer_name] = nn.Embedding(
 977            num_embeddings=1, embedding_dim=layer_output_channels
 978        )
 979
 980    def forward(
 981        self,
 982        input: Tensor,
 983        stage: str,
 984        s_max: float | None = None,
 985        batch_idx: int | None = None,
 986        num_batches: int | None = None,
 987        test_mask: dict[str, Tensor] | None = None,
 988    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
 989        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.
 990
 991        **Args:**
 992        - **input** (`Tensor`): The input tensor from data.
 993        - **stage** (`str`): the stage of the forward pass, should be one of the following:
 994            1. 'train': training stage.
 995            2. 'validation': validation stage.
 996            3. 'test': testing stage.
 997        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
 998        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
 999        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
1000        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
1001
1002        **Returns:**
1003        - **output_feature** (`Tensor`): the output feature maps.
1004        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
1005        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
1006        """
1007        hidden_features = {}
1008
1009        # get the mask for the current task from the task embedding in this stage
1010        mask = self.get_mask(
1011            stage=stage,
1012            s_max=s_max,
1013            batch_idx=batch_idx,
1014            num_batches=num_batches,
1015            test_mask=test_mask,
1016        )
1017
1018        identity = (
1019            self.identity_downsample(input)
1020            if self.identity_downsample is not None
1021            else input
1022        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
1023
1024        x = input
1025        x = self.conv1(x)  # weighted convolutional layer first
1026        x = x * (
1027            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
1028        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
1029        if self.activation:
1030            x = self.conv_activation1(x)  # activation function third
1031        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
1032
1033        x = self.conv2(x)  # weighted convolutional layer first
1034        x = x * (
1035            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
1036        )  # apply the mask to the 2nd convolutional layer. Broadcast the dimension of mask to match the input
1037        if self.activation:
1038            x = self.conv_activation2(x)  # activation function third
1039        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
1040
1041        x = self.conv3(x)  # weighted convolutional layer first
1042        x = x + identity
1043        x = x * (
1044            mask[self.full_3rd_layer_name].view(1, -1, 1, 1)
1045        )  # apply the mask to the 3rd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input
1046        if self.activation:
1047            x = self.activation3(x)  # activation after the shortcut connection
1048        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature
1049
1050        output_feature = x
1051
1052        return output_feature, mask, hidden_features

The larger building block for ResNet-50/101/152. It is referred to as the "bottleneck" building block in the ResNet paper.

It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the original ResNet paper.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNetBlockLarge( outer_layer_name: str, block_idx: int, preceding_output_channels: int, input_channels: int, overall_stride: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
918    def __init__(
919        self,
920        outer_layer_name: str,
921        block_idx: int,
922        preceding_output_channels: int,
923        input_channels: int,
924        overall_stride: int,
925        gate: str,
926        activation_layer: nn.Module | None = nn.ReLU,
927        bias: bool = False,
928    ) -> None:
929        r"""Construct and initialise the larger building block with task embedding.
930
931        **Args:**
932        - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
933        - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
934        - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block.
935        - **input_channels** (`int`): the number of channels of input of this building block.
936        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
937        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
938            - `sigmoid`: the sigmoid function.
939        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
940        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
941        """
942        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
943        ResNetBlockLarge.__init__(
944            self,
945            outer_layer_name=outer_layer_name,
946            block_idx=block_idx,
947            preceding_output_channels=preceding_output_channels,
948            input_channels=input_channels,
949            overall_stride=overall_stride,
950            activation_layer=activation_layer,
951            bias=bias,
952        )
953        self.register_hat_mask_module_explicitly(gate=gate)
954
955        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
956        layer_output_channels = (
957            input_channels  # the output channels of the 1st convolutional layer
958        )
959        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
960            num_embeddings=1, embedding_dim=layer_output_channels
961        )
962
963        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
964        layer_output_channels = (
965            input_channels * 1
966        )  # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion)
967        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
968            num_embeddings=1, embedding_dim=layer_output_channels
969        )
970
971        # construct the task embedding over the 3rd weighted convolutional layer. It is channel-wise
972        layer_output_channels = (
973            input_channels
974            * 4  # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels
975        )
976        self.task_embedding_t[self.full_3rd_layer_name] = nn.Embedding(
977            num_embeddings=1, embedding_dim=layer_output_channels
978        )

Construct and initialise the larger building block with task embedding.

Args:

  • outer_layer_name (str): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer.
  • block_idx (int): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer.
  • preceding_output_channels (int): the number of channels of preceding output of this particular building block.
  • input_channels (int): the number of channels of input of this building block.
  • overall_stride (int): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
def forward( self, input: torch.Tensor, stage: str, s_max: float | None = None, batch_idx: int | None = None, num_batches: int | None = None, test_mask: dict[str, torch.Tensor] | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor], dict[str, torch.Tensor]]:
 980    def forward(
 981        self,
 982        input: Tensor,
 983        stage: str,
 984        s_max: float | None = None,
 985        batch_idx: int | None = None,
 986        num_batches: int | None = None,
 987        test_mask: dict[str, Tensor] | None = None,
 988    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
 989        r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer.
 990
 991        **Args:**
 992        - **input** (`Tensor`): The input tensor from data.
 993        - **stage** (`str`): the stage of the forward pass, should be one of the following:
 994            1. 'train': training stage.
 995            2. 'validation': validation stage.
 996            3. 'test': testing stage.
 997        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
 998        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
 999        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
1000        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
1001
1002        **Returns:**
1003        - **output_feature** (`Tensor`): the output feature maps.
1004        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
1005        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class.
1006        """
1007        hidden_features = {}
1008
1009        # get the mask for the current task from the task embedding in this stage
1010        mask = self.get_mask(
1011            stage=stage,
1012            s_max=s_max,
1013            batch_idx=batch_idx,
1014            num_batches=num_batches,
1015            test_mask=test_mask,
1016        )
1017
1018        identity = (
1019            self.identity_downsample(input)
1020            if self.identity_downsample is not None
1021            else input
1022        )  # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's
1023
1024        x = input
1025        x = self.conv1(x)  # weighted convolutional layer first
1026        x = x * (
1027            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
1028        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
1029        if self.activation:
1030            x = self.conv_activation1(x)  # activation function third
1031        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature
1032
1033        x = self.conv2(x)  # weighted convolutional layer first
1034        x = x * (
1035            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
1036        )  # apply the mask to the 2nd convolutional layer. Broadcast the dimension of mask to match the input
1037        if self.activation:
1038            x = self.conv_activation2(x)  # activation function third
1039        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature
1040
1041        x = self.conv3(x)  # weighted convolutional layer first
1042        x = x + identity
1043        x = x * (
1044            mask[self.full_3rd_layer_name].view(1, -1, 1, 1)
1045        )  # apply the mask to the 3rd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input
1046        if self.activation:
1047            x = self.activation3(x)  # activation after the shortcut connection
1048        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature
1049
1050        output_feature = x
1051
1052        return output_feature, mask, hidden_features

The forward pass for data from task task_id. Task-specific masks for task_id are applied to the units, which are the channels in each weighted convolutional layer.

Args:

  • input (Tensor): The input tensor from data.
  • stage (str): the stage of the forward pass, should be one of the following:
    1. 'train': training stage.
    2. 'validation': validation stage.
    3. 'test': testing stage.
  • s_max (float | None): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in HAT paper.
  • batch_idx (int | None): the current batch index. Applies only to training stage. For other stages, it is default None.
  • num_batches (int | None): the total number of batches. Applies only to training stage. For other stages, it is default None.
  • test_mask (dict[str, Tensor] | None): the binary mask used for test. Applies only to testing stage. For other stages, it is default None.

Returns:

  • output_feature (Tensor): the output feature maps.
  • mask (dict[str, Tensor]): the mask for the current task. Key (str) is layer name, value (Tensor) is the mask tensor. The mask tensor has size (number of units).
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this forward() method of the HAT class.
class HATMaskResNetBase(ResNetBase, clarena.backbones.base.HATMaskBackbone):
class HATMaskResNetBase(ResNetBase, HATMaskBackbone):
    r"""The base class of HAT masked [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet is a convolutional network architecture, which has a 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers each of which contains multiple convolutional parameter layers. Each of the 4 layers is constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are an average pooling layer and a fully connected layer which connects to the CL output heads.

    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism, so it is always switched off here.

        **Args:**
        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type (class, not instance) of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
        """
        # init from both inherited classes
        HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate)
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=building_block_type,
            building_block_nums=building_block_nums,
            building_block_preceding_output_channels=building_block_preceding_output_channels,
            building_block_input_channels=building_block_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=False,  # batch normalisation is incompatible with HAT mechanism
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(
            gate=gate
        )  # register all `nn.Module`s for HATMaskBackbone explicitly because the second `__init__()` wipes out them inited by the first `__init__()`
        self.update_multiple_blocks_task_embedding()

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.task_embedding_t["conv1"] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = False,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks with task embedding. It's used to construct the 2-5 convolutional layers of the HAT masked ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer to make the dimensions match.

        **Args:**
        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type (class, not instance) of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block where other building blocks remain their own overall stride of 1. Inside that building block, this stride is performed at certain convolutional layer in the building block where other convolutional layers remain stride of 1:
            - For `ResNetBlockSmall`, it performs at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it performs at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): accepted only for compatibility with the original ResNet API. It is not forwarded to the building blocks, because batch normalisation is incompatible with HAT mechanism.
        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """
        # number of channels coming out of a non-first block of this layer: the small block keeps the
        # number of input channels, while the large (bottleneck) block expands it 4 times.
        # `issubclass` (rather than `==`) keeps this correct for subclasses of the small block.
        if issubclass(building_block_type, HATMaskResNetBlockSmall):
            inner_block_output_channels = input_channels
        else:
            inner_block_output_channels = input_channels * 4

        layer = []

        for block_idx in range(building_block_num):
            is_first_block = block_idx == 0
            block = building_block_type(
                outer_layer_name=layer_name,
                block_idx=block_idx,
                # the 1st block receives the output of the preceding multi-building-block layer,
                # every other block receives the output of the previous block in this layer
                preceding_output_channels=(
                    preceding_output_channels
                    if is_first_block
                    else inner_block_output_channels
                ),
                input_channels=input_channels,
                # only perform the overall stride at the 1st block in this multi-building-block layer
                overall_stride=overall_stride if is_first_block else 1,
                gate=self.gate,
                # no batch normalisation in HAT masked blocks
                activation_layer=activation_layer,
                bias=bias,
            )
            layer.append(block)

            # collect the weighted layer names in the block and sync to the weighted layer names list in the outer network
            self.weighted_layer_names += block.weighted_layer_names

        return nn.Sequential(*layer)

    def update_multiple_blocks_task_embedding(self) -> None:
        r"""Collect the task embeddings in the multiple building blocks (2-5 convolutional layers) and sync them to the task embedding dict of the outer network.

        This should only be called explicitly after the `__init__()` method, just because task embedding as `nn.Module` instance was wiped out at the beginning of it.
        """
        for layer in (self.conv2x, self.conv3x, self.conv4x, self.conv5x):
            for block in layer:
                self.task_embedding_t.update(block.task_embedding_t)

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. Task-specific masks for `task_id` are applied to the units which are channels in each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        batch_size = input.size(0)
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        x = input

        x = self.conv1(x)

        x = x * (
            mask["conv1"].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        # go through the 2-5 convolutional layers block by block. Each block gets its own mask
        # from its own task embedding, so the mask it returns is discarded here
        for layer in (self.conv2x, self.conv3x, self.conv4x, self.conv5x):
            for block in layer:
                x, _, hidden_features_block = block(
                    x,
                    stage=stage,
                    s_max=s_max,
                    batch_idx=batch_idx,
                    num_batches=num_batches,
                    test_mask=test_mask,
                )
                hidden_features.update(
                    hidden_features_block
                )  # store the hidden feature

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through heads

        return output_feature, mask, hidden_features

The base class of HAT masked residual network (ResNet).

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet is a convolutional network architecture, which has a 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers each of which contains multiple convolutional parameter layers. Each of the 4 layers is constructed from basic building blocks which are either small (ResNetBlockSmall) or large (ResNetBlockLarge). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are an average pooling layer and a fully connected layer which connects to the CL output heads.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNetBase( input_channels: int, building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge, building_block_nums: tuple[int, int, int, int], building_block_preceding_output_channels: tuple[int, int, int, int], building_block_input_channels: tuple[int, int, int, int], output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1065    def __init__(
1066        self,
1067        input_channels: int,
1068        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
1069        building_block_nums: tuple[int, int, int, int],
1070        building_block_preceding_output_channels: tuple[int, int, int, int],
1071        building_block_input_channels: tuple[int, int, int, int],
1072        output_dim: int,
1073        gate: str,
1074        activation_layer: nn.Module | None = nn.ReLU,
1075        bias: bool = False,
1076    ) -> None:
1077        r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1078
1079        **Args:**
1080        - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension.
1081        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of building block used in the ResNet.
1082        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly.
1083        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
1084        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
1085        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1086        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1087            - `sigmoid`: the sigmoid function.
1088        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
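1089        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`. Note that the usual rationale for this default (batch normalisation plays a role similar to bias) does not apply here, because batch normalisation is disabled in this HAT masked backbone.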
1090        """
1091        # init from both inherited classes
1092        HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate)
1093        ResNetBase.__init__(
1094            self,
1095            input_channels=input_channels,
1096            building_block_type=building_block_type,
1097            building_block_nums=building_block_nums,
1098            building_block_preceding_output_channels=building_block_preceding_output_channels,
1099            building_block_input_channels=building_block_input_channels,
1100            output_dim=output_dim,
1101            activation_layer=activation_layer,
1102            batch_normalisation=False,  # batch normalisation is incompatible with HAT mechanism
1103            bias=bias,
1104        )
1105        self.register_hat_mask_module_explicitly(
1106            gate=gate
1107        )  # register all `nn.Module`s for HATMaskBackbone explicitly, because the second `__init__()` wipes out those initialised by the first `__init__()`
1108        self.update_multiple_blocks_task_embedding()
1109
1110        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
1111        layer_output_channels = 64  # the output channels of the 1st convolutional layer
1112        self.task_embedding_t["conv1"] = nn.Embedding(
1113            num_embeddings=1, embedding_dim=layer_output_channels
1114        )

Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input. Image data keep their channel dimension when fed into ResNet. Note that convolutional networks require the number of input channels rather than the input dimension.
  • building_block_type (HATMaskResNetBlockSmall | HATMaskResNetBlockLarge): the type of building block used in the ResNet.
  • building_block_nums (tuple[int, int, int, int]): the number of building blocks in the 2-5 convolutional layer correspondingly.
  • building_block_preceding_output_channels (tuple[int, int, int, int]): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly.
  • building_block_input_channels (tuple[int, int, int, int]): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
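  • bias (bool): whether to use bias in the convolutional layer. Default False. Note that the usual rationale for this default (batch normalisation plays a role similar to bias) does not apply here, because batch normalisation is disabled in this HAT masked backbone.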
def update_multiple_blocks_task_embedding(self) -> None:
1184    def update_multiple_blocks_task_embedding(self) -> None:
1185        r"""Collect the task embeddings in the multiple building blocks (2nd-5th convolutional layers) and sync them to the task embedding collection (`task_embedding_t`) of the outer network.
1186
1187        This should only be called explicitly after the `__init__()` method, because the task embeddings, as `nn.Module` instances, were wiped out at the beginning of it.
1188        """
1189        for block in self.conv2x:
1190            self.task_embedding_t.update(block.task_embedding_t)
1191        for block in self.conv3x:
1192            self.task_embedding_t.update(block.task_embedding_t)
1193        for block in self.conv4x:
1194            self.task_embedding_t.update(block.task_embedding_t)
1195        for block in self.conv5x:
1196            self.task_embedding_t.update(block.task_embedding_t)

Collect the task embeddings in the multiple building blocks (2nd-5th convolutional layers) and sync them to the task embedding collection (task_embedding_t) of the outer network.

This should only be called explicitly after the __init__() method, because the task embeddings, as nn.Module instances, were wiped out at the beginning of it.

def forward( self, input: torch.Tensor, stage: str, s_max: float | None = None, batch_idx: int | None = None, num_batches: int | None = None, test_mask: dict[str, torch.Tensor] | None = None) -> tuple[torch.Tensor, dict[str, torch.Tensor], dict[str, torch.Tensor]]:
1198    def forward(
1199        self,
1200        input: Tensor,
1201        stage: str,
1202        s_max: float | None = None,
1203        batch_idx: int | None = None,
1204        num_batches: int | None = None,
1205        test_mask: dict[str, Tensor] | None = None,
1206    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
1207        r"""The forward pass for data from the current task. The task-specific mask is applied to the units, which are the channels of each weighted convolutional layer.
1208
1209        **Args:**
1210        - **input** (`Tensor`): the input tensor from data.
1211        - **stage** (`str`): the stage of the forward pass, should be one of the following:
1212            1. 'train': training stage.
1213            2. 'validation': validation stage.
1214            3. 'test': testing stage.
1215        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a).
1216        - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`.
1217        - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`.
1218        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`.
1219
1220        **Returns:**
1221        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
1222        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
1223        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
1224        """
1225        batch_size = input.size(0)
1226        hidden_features = {}
1227
1228        # get the mask for the current task from the task embedding in this stage
1229        mask = self.get_mask(
1230            stage=stage,
1231            s_max=s_max,
1232            batch_idx=batch_idx,
1233            num_batches=num_batches,
1234            test_mask=test_mask,
1235        )
1236
1237        x = input
1238
1239        x = self.conv1(x)
1240
1241        x = x * (
1242            mask["conv1"].view(1, -1, 1, 1)
1243        )  # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input
1244        if self.activation:
1245            x = self.conv_activation1(x)
1246        hidden_features["conv1"] = x
1247
1248        x = self.maxpool(x)
1249
1250        for block in self.conv2x:
1251            x, _, hidden_features_block = block(
1252                x,
1253                stage=stage,
1254                s_max=s_max,
1255                batch_idx=batch_idx,
1256                num_batches=num_batches,
1257                test_mask=test_mask,
1258            )
1259            hidden_features.update(hidden_features_block)  # store the hidden feature
1260        for block in self.conv3x:
1261            x, _, hidden_features_block = block(
1262                x,
1263                stage=stage,
1264                s_max=s_max,
1265                batch_idx=batch_idx,
1266                num_batches=num_batches,
1267                test_mask=test_mask,
1268            )
1269            hidden_features.update(hidden_features_block)  # store the hidden feature
1270        for block in self.conv4x:
1271            x, _, hidden_features_block = block(
1272                x,
1273                stage=stage,
1274                s_max=s_max,
1275                batch_idx=batch_idx,
1276                num_batches=num_batches,
1277                test_mask=test_mask,
1278            )
1279            hidden_features.update(hidden_features_block)  # store the hidden feature
1280        for block in self.conv5x:
1281            x, _, hidden_features_block = block(
1282                x,
1283                stage=stage,
1284                s_max=s_max,
1285                batch_idx=batch_idx,
1286                num_batches=num_batches,
1287                test_mask=test_mask,
1288            )
1289            hidden_features.update(hidden_features_block)  # store the hidden feature
1290
1291        x = self.avepool(x)
1292
1293        output_feature = x.view(batch_size, -1)  # flatten before going through heads
1294
1295        return output_feature, mask, hidden_features

The forward pass for data from the current task. The task-specific mask is applied to the units, which are the channels of each weighted convolutional layer.

Args:

  • input (Tensor): the input tensor from data.
  • stage (str): the stage of the forward pass, should be one of the following:
    1. 'train': training stage.
    2. 'validation': validation stage.
    3. 'test': testing stage.
  • s_max (float | None): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in HAT paper.
  • batch_idx (int | None): the current batch index. Applies only to training stage. For other stages, it is default None.
  • num_batches (int | None): the total number of batches. Applies only to training stage. For other stages, it is default None.
  • test_mask (dict[str, Tensor] | None): the binary mask used for test. Applies only to testing stage. For other stages, it is default None.

Returns:

  • output_feature (Tensor): the output feature tensor to be passed to the heads.
  • mask (dict[str, Tensor]): the mask for the current task. Key (str) is layer name, value (Tensor) is the mask tensor. The mask tensor has size (number of units).
  • hidden_features (dict[str, Tensor]): the hidden features (after activation) in each weighted layer. Key (str) is the weighted layer name, value (Tensor) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this forward() method of the HAT class.
class HATMaskResNet18(HATMaskResNetBase):
1298class HATMaskResNet18(HATMaskResNetBase):
1299    r"""HAT masked ResNet-18 backbone network.
1300
1301    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
1302
1303    ResNet-18 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.
1304
1305    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1306    """
1307
1308    def __init__(
1309        self,
1310        input_channels: int,
1311        output_dim: int,
1312        gate: str,
1313        activation_layer: nn.Module | None = nn.ReLU,
1314        bias: bool = False,
1315    ) -> None:
1316        r"""Construct and initialise the ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1317
1318        **Args:**
1319        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1320        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1321        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1322            - `sigmoid`: the sigmoid function.
1323        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1324        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1325        """
1326        HATMaskResNetBase.__init__(
1327            self,
1328            input_channels=input_channels,
1329            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-18
1330            building_block_nums=(2, 2, 2, 2),
1331            building_block_preceding_output_channels=(64, 64, 128, 256),
1332            building_block_input_channels=(64, 128, 256, 512),
1333            output_dim=output_dim,
1334            gate=gate,
1335            activation_layer=activation_layer,
1336            bias=bias,
1337        )

HAT masked ResNet-18 backbone network.

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet-18 is a smaller architecture proposed in the original ResNet paper. It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNet18( input_channels: int, output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1308    def __init__(
1309        self,
1310        input_channels: int,
1311        output_dim: int,
1312        gate: str,
1313        activation_layer: nn.Module | None = nn.ReLU,
1314        bias: bool = False,
1315    ) -> None:
1316        r"""Construct and initialise the ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1317
1318        **Args:**
1319        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1320        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1321        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1322            - `sigmoid`: the sigmoid function.
1323        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1324        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1325        """
1326        HATMaskResNetBase.__init__(
1327            self,
1328            input_channels=input_channels,
1329            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-18
1330            building_block_nums=(2, 2, 2, 2),
1331            building_block_preceding_output_channels=(64, 64, 128, 256),
1332            building_block_input_channels=(64, 128, 256, 512),
1333            output_dim=output_dim,
1334            gate=gate,
1335            activation_layer=activation_layer,
1336            bias=bias,
1337        )

Construct and initialise the ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
class HATMaskResNet34(HATMaskResNetBase):
1340class HATMaskResNet34(HATMaskResNetBase):
1341    r"""HAT masked ResNet-34 backbone network.
1342
1343    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
1344
1345    ResNet-34 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.
1346
1347    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1348    """
1349
1350    def __init__(
1351        self,
1352        input_channels: int,
1353        output_dim: int,
1354        gate: str,
1355        activation_layer: nn.Module | None = nn.ReLU,
1356        bias: bool = False,
1357    ) -> None:
1358        r"""Construct and initialise the ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1359
1360        **Args:**
1361        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1362        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1363        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1364            - `sigmoid`: the sigmoid function.
1365        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1366        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1367        """
1368        HATMaskResNetBase.__init__(
1369            self,
1370            input_channels=input_channels,
1371            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-34
1372            building_block_nums=(3, 4, 6, 3),
1373            building_block_preceding_output_channels=(64, 64, 128, 256),
1374            building_block_input_channels=(64, 128, 256, 512),
1375            output_dim=output_dim,
1376            gate=gate,
1377            activation_layer=activation_layer,
1378            bias=bias,
1379        )

HAT masked ResNet-34 backbone network.

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet-34 is a smaller architecture proposed in the original ResNet paper. It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNet34( input_channels: int, output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1350    def __init__(
1351        self,
1352        input_channels: int,
1353        output_dim: int,
1354        gate: str,
1355        activation_layer: nn.Module | None = nn.ReLU,
1356        bias: bool = False,
1357    ) -> None:
1358        r"""Construct and initialise the ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1359
1360        **Args:**
1361        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1362        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1363        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1364            - `sigmoid`: the sigmoid function.
1365        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1366        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1367        """
1368        HATMaskResNetBase.__init__(
1369            self,
1370            input_channels=input_channels,
1371            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-34
1372            building_block_nums=(3, 4, 6, 3),
1373            building_block_preceding_output_channels=(64, 64, 128, 256),
1374            building_block_input_channels=(64, 128, 256, 512),
1375            output_dim=output_dim,
1376            gate=gate,
1377            activation_layer=activation_layer,
1378            bias=bias,
1379        )

Construct and initialise the ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
class HATMaskResNet50(HATMaskResNetBase):
1382class HATMaskResNet50(HATMaskResNetBase):
1383    r"""HAT masked ResNet-50 backbone network.
1384
1385    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
1386
1387    ResNet-50 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.
1388
1389    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1390    """
1391
1392    def __init__(
1393        self,
1394        input_channels: int,
1395        output_dim: int,
1396        gate: str,
1397        activation_layer: nn.Module | None = nn.ReLU,
1398        bias: bool = False,
1399    ) -> None:
1400        r"""Construct and initialise the ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1401
1402        **Args:**
1403        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1404        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1405        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1406            - `sigmoid`: the sigmoid function.
1407        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1408        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1409        """
1410        HATMaskResNetBase.__init__(
1411            self,
1412            input_channels=input_channels,
1413            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-50
1414            building_block_nums=(3, 4, 6, 3),
1415            building_block_preceding_output_channels=(64, 256, 512, 1024),
1416            building_block_input_channels=(64, 128, 256, 512),
1417            output_dim=output_dim,
1418            gate=gate,
1419            activation_layer=activation_layer,
1420            bias=bias,
1421        )

HAT masked ResNet-50 backbone network.

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet-50 is a larger architecture proposed in the original ResNet paper. It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNet50( input_channels: int, output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1392    def __init__(
1393        self,
1394        input_channels: int,
1395        output_dim: int,
1396        gate: str,
1397        activation_layer: nn.Module | None = nn.ReLU,
1398        bias: bool = False,
1399    ) -> None:
1400        r"""Construct and initialise the ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1401
1402        **Args:**
1403        - **input_channels** (`int`): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
1404        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1405        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1406            - `sigmoid`: the sigmoid function.
1407        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1408        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1409        """
1410        HATMaskResNetBase.__init__(
1411            self,
1412            input_channels=input_channels,
1413            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-50
1414            building_block_nums=(3, 4, 6, 3),
1415            building_block_preceding_output_channels=(64, 256, 512, 1024),
1416            building_block_input_channels=(64, 128, 256, 512),
1417            output_dim=output_dim,
1418            gate=gate,
1419            activation_layer=activation_layer,
1420            bias=bias,
1421        )

Construct and initialise the ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input of this network. Note that convolutional networks require the number of input channels rather than the input dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
class HATMaskResNet101(HATMaskResNetBase):
1424class HATMaskResNet101(HATMaskResNetBase):
1425    r"""HAT masked ResNet-101 backbone network.
1426
1427    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
1428
1429    ResNet-101 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.
1430
1431    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1432    """
1433
1434    def __init__(
1435        self,
1436        input_channels: int,
1437        output_dim: int,
1438        gate: str,
1439        activation_layer: nn.Module | None = nn.ReLU,
1440        bias: bool = False,
1441    ) -> None:
1442        r"""Construct and initialise the ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1443
1444        **Args:**
1445        - **input_channels** (`int`): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
1446        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1447        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1448            - `sigmoid`: the sigmoid function.
1449        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1450        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1451        """
1452        HATMaskResNetBase.__init__(
1453            self,
1454            input_channels=input_channels,
1455            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-101
1456            building_block_nums=(3, 4, 23, 3),
1457            building_block_preceding_output_channels=(64, 256, 512, 1024),
1458            building_block_input_channels=(64, 128, 256, 512),
1459            output_dim=output_dim,
1460            gate=gate,
1461            activation_layer=activation_layer,
1462            bias=bias,
1463        )

HAT masked ResNet-101 backbone network.

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet-101 is a larger architecture proposed in the original ResNet paper. It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNet101( input_channels: int, output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1434    def __init__(
1435        self,
1436        input_channels: int,
1437        output_dim: int,
1438        gate: str,
1439        activation_layer: nn.Module | None = nn.ReLU,
1440        bias: bool = False,
1441    ) -> None:
1442        r"""Construct and initialise the ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1443
1444        **Args:**
1445        - **input_channels** (`int`): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
1446        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1447        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1448            - `sigmoid`: the sigmoid function.
1449        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1450        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1451        """
1452        HATMaskResNetBase.__init__(
1453            self,
1454            input_channels=input_channels,
1455            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-101
1456            building_block_nums=(3, 4, 23, 3),
1457            building_block_preceding_output_channels=(64, 256, 512, 1024),
1458            building_block_input_channels=(64, 128, 256, 512),
1459            output_dim=output_dim,
1460            gate=gate,
1461            activation_layer=activation_layer,
1462            bias=bias,
1463        )

Construct and initialise the ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.
class HATMaskResNet152(HATMaskResNetBase):
1466class HATMaskResNet152(HATMaskResNetBase):
1467    r"""HAT masked ResNet-152 backbone network.
1468
1469    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
1470
1471    ResNet-152 is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.
1472
1473    Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1474    """
1475
1476    def __init__(
1477        self,
1478        input_channels: int,
1479        output_dim: int,
1480        gate: str,
1481        activation_layer: nn.Module | None = nn.ReLU,
1482        bias: bool = False,
1483    ) -> None:
1484        r"""Construct and initialise the ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1485
1486        **Args:**
1487        - **input_channels** (`int`): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
1488        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1489        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1490            - `sigmoid`: the sigmoid function.
1491        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1492        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1493        """
1494        HATMaskResNetBase.__init__(
1495            self,
1496            input_channels=input_channels,
1497            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-152
1498            building_block_nums=(3, 8, 36, 3),
1499            building_block_preceding_output_channels=(64, 256, 512, 1024),
1500            building_block_input_channels=(64, 128, 256, 512),
1501            output_dim=output_dim,
1502            gate=gate,
1503            activation_layer=activation_layer,
1504            bias=bias,
1505        )

HAT masked ResNet-152 backbone network.

HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

ResNet-152 is the largest architecture proposed in the original ResNet paper. It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.

Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.

HATMaskResNet152( input_channels: int, output_dim: int, gate: str, activation_layer: torch.nn.modules.module.Module | None = <class 'torch.nn.modules.activation.ReLU'>, bias: bool = False)
1476    def __init__(
1477        self,
1478        input_channels: int,
1479        output_dim: int,
1480        gate: str,
1481        activation_layer: nn.Module | None = nn.ReLU,
1482        bias: bool = False,
1483    ) -> None:
1484        r"""Construct and initialise the ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
1485
1486        **Args:**
1487        - **input_channels** (`int`): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
1488        - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
1489        - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
1490            - `sigmoid`: the sigmoid function.
1491        - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`.
1492        - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`.
1493        """
1494        HATMaskResNetBase.__init__(
1495            self,
1496            input_channels=input_channels,
1497            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-152
1498            building_block_nums=(3, 8, 36, 3),
1499            building_block_preceding_output_channels=(64, 256, 512, 1024),
1500            building_block_input_channels=(64, 128, 256, 512),
1501            output_dim=output_dim,
1502            gate=gate,
1503            activation_layer=activation_layer,
1504            bias=bias,
1505        )

Construct and initialise the ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.

Args:

  • input_channels (int): the number of channels of the input to this backbone network. Note that convolutional networks require number of input channels instead of dimension.
  • output_dim (int): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads.
  • gate (str): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:
    • sigmoid: the sigmoid function.
  • activation_layer (nn.Module): activation function of each layer (if not None), if None this layer won't be used. Default nn.ReLU.
  • bias (bool): whether to use bias in the convolutional layer. Default False.