clarena.backbones.resnet
The submodule in `backbones` for ResNet backbone networks.
1r""" 2The submodule in `backbones` for ResNet backbone network. 3""" 4 5__all__ = [ 6 "ResNetBlockSmall", 7 "ResNetBlockLarge", 8 "ResNetBase", 9 "ResNet18", 10 "ResNet34", 11 "ResNet50", 12 "ResNet101", 13 "ResNet152", 14 "HATMaskResNetBlockSmall", 15 "HATMaskResNetBlockLarge", 16 "HATMaskResNetBase", 17 "HATMaskResNet18", 18 "HATMaskResNet34", 19 "HATMaskResNet50", 20 "HATMaskResNet101", 21 "HATMaskResNet152", 22] 23 24 25from torch import Tensor, nn 26 27from clarena.backbones import CLBackbone, HATMaskBackbone 28 29 30class ResNetBlockSmall(CLBackbone): 31 r"""The smaller building block for ResNet-18/34. 32 33 It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 34 """ 35 36 def __init__( 37 self, 38 outer_layer_name: str, 39 block_idx: int, 40 preceding_output_channels: int, 41 input_channels: int, 42 overall_stride: int, 43 activation_layer: nn.Module | None = nn.ReLU, 44 batch_normalisation: bool = True, 45 bias: bool = False, 46 ) -> None: 47 r"""Construct and initialise the smaller building block. 48 49 **Args:** 50 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 51 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 52 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 53 - **input_channels** (`int`): the number of channels of input of this building block. 54 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1. 55 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 56 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 57 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 58 """ 59 CLBackbone.__init__(self, output_dim=None) 60 61 self.batch_normalisation: bool = batch_normalisation 62 r"""Store whether to use batch normalisation after the fully-connected layers.""" 63 self.activation: bool = activation_layer is not None 64 r"""Store whether to use activation function after the fully-connected layers.""" 65 66 self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1" 67 r"""Format and store full name of the 1st weighted convolutional layer. """ 68 self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2" 69 r"""Format and store full name of the 2nd weighted convolutional layer. 
""" 70 71 # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc) 72 layer_input_channels = preceding_output_channels # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module 73 layer_output_channels = ( 74 input_channels # the output channels of the 1st convolutional layer 75 ) 76 self.conv1 = nn.Conv2d( 77 in_channels=layer_input_channels, 78 out_channels=layer_output_channels, 79 kernel_size=3, 80 stride=1, 81 padding=1, 82 bias=bias, 83 ) # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here 84 r"""The 1st weight convolutional layer of the smaller building block. """ 85 self.weighted_layer_names.append( 86 self.full_1st_layer_name 87 ) # update the weighted layer names 88 if self.batch_normalisation: 89 self.conv_bn1 = nn.BatchNorm2d( 90 num_features=layer_output_channels 91 ) # construct the batch normalisation layer 92 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """ 93 if self.activation: 94 self.conv_activation1 = activation_layer() # construct the activation layer 95 r"""The activation layer after the 1st weighted convolutional layer. """ 96 97 # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc) 98 layer_input_channels = input_channels # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer 99 layer_output_channels = ( 100 input_channels * 1 101 ) # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 102 self.conv2 = nn.Conv2d( 103 in_channels=layer_input_channels, 104 out_channels=layer_output_channels, 105 kernel_size=3, 106 stride=overall_stride, 107 padding=1, 108 bias=bias, 109 ) # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here 110 r"""The 2nd weight convolutional layer of the smaller building block. """ 111 self.weighted_layer_names.append( 112 self.full_2nd_layer_name 113 ) # update the weighted layer names 114 if batch_normalisation: 115 self.conv_bn2 = nn.BatchNorm2d( 116 num_features=layer_output_channels 117 ) # construct the batch normalisation layer 118 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """ 119 if self.activation: 120 self.conv_activation2 = activation_layer() # construct the activation layer 121 r"""The activation layer after the 2nd weighted convolutional layer. """ 122 123 self.identity_downsample: nn.Module = ( 124 nn.Conv2d( 125 in_channels=preceding_output_channels, 126 out_channels=input_channels, 127 kernel_size=1, 128 stride=overall_stride, 129 bias=False, 130 ) 131 if preceding_output_channels != input_channels or overall_stride != 1 132 else None 133 ) # construct the identity downsample function 134 r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """ 135 136 def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]: 137 r"""The forward pass for data. 138 139 **Args:** 140 - **input** (`Tensor`): the input feature maps. 
141 142 **Returns:** 143 - **output_feature** (`Tensor`): the output feature maps. 144 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. 145 """ 146 hidden_features = {} 147 148 identity = ( 149 self.identity_downsample(input) 150 if self.identity_downsample is not None 151 else input 152 ) # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's 153 154 x = input 155 x = self.conv1(x) 156 if self.batch_normalisation: 157 x = self.conv_bn1(x) 158 if self.activation: 159 x = self.conv_activation1(x) 160 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 161 162 x = self.conv2(x) 163 if self.batch_normalisation: 164 x = self.conv_bn2(x) 165 166 x = x + identity 167 if self.activation: 168 x = self.conv_activation2(x) # activation after the shortcut connection 169 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 170 171 output_feature = x 172 173 return output_feature, hidden_features 174 175 176class ResNetBlockLarge(CLBackbone): 177 r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the paper. 178 179 It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 180 """ 181 182 def __init__( 183 self, 184 outer_layer_name: str, 185 block_idx: int, 186 preceding_output_channels: int, 187 input_channels: int, 188 overall_stride: int, 189 activation_layer: nn.Module | None = nn.ReLU, 190 batch_normalisation: bool = True, 191 bias: bool = False, 192 ) -> None: 193 r"""Construct and initialise the larger building block. 194 195 **Args:** 196 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 197 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 198 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 199 - **input_channels** (`int`): the number of channels of input of this building block. 200 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1. 201 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 202 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 203 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 
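

# A minimal usage sketch for `ResNetBlockSmall` (illustrative only; blocks are
# normally constructed via `ResNetBase._multiple_blocks` rather than by hand).
# The channel counts and the 56x56 feature map size are assumed values, matching
# the first block of the conv2_x stage on ImageNet-sized inputs:
#
#     import torch
#
#     block = ResNetBlockSmall(
#         outer_layer_name="conv2x",
#         block_idx=0,
#         preceding_output_channels=64,
#         input_channels=64,
#         overall_stride=1,  # channels and stride match, so no identity downsample
#     )
#     out, hidden = block(torch.randn(8, 64, 56, 56))
#     assert out.shape == (8, 64, 56, 56)
#     assert set(hidden) == {"conv2x/0/conv1", "conv2x/0/conv2"}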
204 """ 205 CLBackbone.__init__(self, output_dim=None) 206 207 self.batch_normalisation: bool = batch_normalisation 208 r"""Store whether to use batch normalisation after the fully-connected layers.""" 209 self.activation: bool = activation_layer is not None 210 r"""Store whether to use activation function after the fully-connected layers.""" 211 212 self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1" 213 r"""Format and store full name of the 1st weighted convolutional layer. """ 214 self.full_2nd_layer_name = f"{outer_layer_name}_{block_idx}_conv2" 215 r"""Format and store full name of the 2nd weighted convolutional layer. """ 216 self.full_3rd_layer_name = f"{outer_layer_name}_{block_idx}_conv3" 217 r"""Format and store full name of the 3rd weighted convolutional layer. """ 218 219 # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc) 220 layer_input_channels = preceding_output_channels # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module 221 layer_output_channels = ( 222 input_channels # the output channels of the 1st convolutional layer 223 ) 224 self.conv1 = nn.Conv2d( 225 in_channels=layer_input_channels, 226 out_channels=layer_output_channels, 227 kernel_size=1, 228 stride=1, 229 padding=0, 230 bias=bias, 231 ) # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here 232 r"""The 1st weight convolutional layer of the larger building block. """ 233 self.weighted_layer_names.append( 234 self.full_1st_layer_name 235 ) # update the weighted layer names 236 if self.batch_normalisation: 237 self.conv_bn1 = nn.BatchNorm2d( 238 num_features=layer_output_channels 239 ) # construct the batch normalisation layer 240 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """ 241 if self.activation: 242 self.conv_activation1 = activation_layer() # construct the activation layer 243 r"""The activation layer after the 1st weighted convolutional layer. """ 244 245 # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc) 246 layer_input_channels = input_channels # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer 247 layer_output_channels = ( 248 input_channels 249 * 1 # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 250 ) 251 self.conv2 = nn.Conv2d( 252 in_channels=layer_input_channels, 253 out_channels=layer_output_channels, 254 kernel_size=3, 255 stride=overall_stride, 256 padding=1, 257 bias=bias, 258 ) # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here 259 r"""The 2nd weight convolutional layer of the larger building block. """ 260 self.weighted_layer_names.append( 261 self.full_2nd_layer_name 262 ) # update the weighted layer names 263 if self.batch_normalisation: 264 self.conv_bn2 = nn.BatchNorm2d( 265 num_features=layer_output_channels 266 ) # construct the batch normalisation layer 267 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """ 268 if self.activation: 269 self.conv_activation2 = activation_layer() # construct the activation layer 270 r"""The activation layer after the 2nd weighted convolutional layer. 
""" 271 272 # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc) 273 layer_input_channels = ( 274 input_channels * 1 275 ) # the input channels of the 2nd convolutional layer, which is `input_channels * 1`, the same as the output channels of the 1st convolutional layer 276 layer_output_channels = ( 277 input_channels 278 * 4 # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels 279 ) 280 self.conv3 = nn.Conv2d( 281 in_channels=layer_input_channels, 282 out_channels=layer_output_channels, 283 kernel_size=1, 284 stride=1, 285 padding=0, 286 bias=bias, 287 ) # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here 288 r"""The 3rd weight convolutional layer of the larger building block. """ 289 self.weighted_layer_names.append( 290 self.full_3rd_layer_name 291 ) # update the weighted layer names 292 if batch_normalisation: 293 self.conv_bn3 = nn.BatchNorm2d( 294 num_features=layer_output_channels 295 ) # construct the batch normalisation layer 296 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """ 297 if self.activation: 298 self.conv_activation3 = activation_layer() # construct the activation layer 299 r"""The activation layer after the 3rd weighted convolutional layer. """ 300 301 self.identity_downsample: nn.Module = ( 302 nn.Conv2d( 303 in_channels=preceding_output_channels, 304 out_channels=input_channels * 4, 305 kernel_size=1, 306 stride=overall_stride, 307 bias=False, 308 ) 309 if preceding_output_channels != input_channels * 4 or overall_stride != 1 310 else None 311 ) 312 r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """ 313 314 def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]: 315 r"""The forward pass for data. 316 317 **Args:** 318 - **input** (`Tensor`): the input feature maps. 319 320 **Returns:** 321 - **output_feature** (`Tensor`): the output feature maps. 322 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. 323 """ 324 hidden_features = {} 325 326 identity = ( 327 self.identity_downsample(input) 328 if self.identity_downsample is not None 329 else input 330 ) # remember the identity of input for the shortcut connection. 
Perform downsampling if its dimension doesn't match the output's 331 332 x = input 333 x = self.conv1(x) 334 if self.batch_normalisation: 335 x = self.conv_bn1(x) 336 if self.activation: 337 x = self.conv_activation1(x) 338 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 339 340 x = self.conv2(x) 341 if self.batch_normalisation: 342 x = self.conv_bn2(x) 343 if self.activation: 344 x = self.conv_activation2(x) 345 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 346 347 x = self.conv3(x) 348 if self.batch_normalisation: 349 x = self.conv_bn3(x) 350 351 x = x + identity 352 if self.activation: 353 x = self.conv_activation3(x) # activation after the shortcut connection 354 hidden_features[self.full_3rd_layer_name] = x # store the hidden feature 355 356 output_feature = x 357 358 return output_feature, hidden_features 359 360 361class ResNetBase(CLBackbone): 362 r"""The base class of [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 363 364 ResNet is a convolutional network architecture, which has 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers which contains multiple convolutional parameter layer. Each layer of the 4 are constructed from basic building blocks which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are average pooling layer and a fully connected layer which connects to the CL output heads. 365 """ 366 367 def __init__( 368 self, 369 input_channels: int, 370 building_block_type: ResNetBlockSmall | ResNetBlockLarge, 371 building_block_nums: tuple[int, int, int, int], 372 building_block_preceding_output_channels: tuple[int, int, int, int], 373 building_block_input_channels: tuple[int, int, int, int], 374 output_dim: int, 375 activation_layer: nn.Module | None = nn.ReLU, 376 batch_normalisation: bool = True, 377 bias: bool = False, 378 ) -> None: 379 r"""Construct and initialise the ResNet backbone network. 380 381 **Args:** 382 - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension. 383 - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet. 384 - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly. 385 - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly. 386 - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly. 387 - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. 
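

# A sketch of the bottleneck channel bookkeeping (illustrative only). The assumed
# values match the first block of the conv3_x stage of ResNet-50: the 1x1 conv
# squeezes 256 channels to 128, the 3x3 conv applies the overall stride, and the
# final 1x1 conv expands the channels 4 times, so the identity is downsampled too:
#
#     import torch
#
#     block = ResNetBlockLarge(
#         outer_layer_name="conv3x",
#         block_idx=0,
#         preceding_output_channels=256,  # conv2_x outputs 64 * 4 = 256 channels
#         input_channels=128,
#         overall_stride=2,  # performed at the middle 3x3 layer: 56x56 -> 28x28
#     )
#     out, _ = block(torch.randn(2, 256, 56, 56))
#     assert out.shape == (2, 128 * 4, 28, 28)  # output channels expanded 4 times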


class ResNetBase(CLBackbone):
    r"""The base class of the [residual network (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    ResNet is a convolutional network architecture. It has a 1st convolutional parameter layer and a maxpooling layer, followed by 4 multi-building-block convolutional layers, each of which contains multiple convolutional parameter layers. Each of these 4 layers is constructed from basic building blocks, either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`), and each building block contains several convolutional parameter layers. The building blocks carry a shortcut connection, a direct connection from the input of the block to its output, which is why the architecture is called residual (see "shortcut connections" in the paper for details). After the 5th convolutional layer, an average pooling layer and a flattening operation produce the features that connect to the CL output heads.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Image data keep their channel dimension when entering the ResNet. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2nd-5th convolutional layers, correspondingly.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of the preceding output of each building block in the 2nd-5th convolutional layers, correspondingly.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of the input of each building block in the 2nd-5th convolutional layers, correspondingly.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, as in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        CLBackbone.__init__(self, output_dim=output_dim)

        self.batch_normalisation: bool = batch_normalisation
        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Store whether to use an activation function after the weighted convolutional layers."""

        # construct the 1st weighted convolutional layer and its attached layers (batchnorm, activation, etc.)
        layer_input_channels = input_channels  # the input channels of the 1st convolutional layer, which receives the input of the entire network
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )  # construct the 1st weighted convolutional layer of the entire ResNet
        r"""The 1st weighted convolutional layer of the entire ResNet. It always has a fixed kernel size of 7x7, stride 2, and padding 3."""
        self.weighted_layer_names.append("conv1")  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn1 = nn.BatchNorm2d(
                num_features=layer_output_channels
            )  # construct the batch normalisation layer
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer."""
        if self.activation:
            self.conv_activation1 = activation_layer()  # construct the activation layer
            r"""The activation layer after the 1st weighted convolutional layer."""

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        r"""The max pooling layer between the 1st and 2nd convolutional layers, with kernel size 3x3 and stride 2."""

        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding maxpooling layer has stride 2, which already made 112x112 -> 56x56. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple building blocks."""

        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple building blocks."""

        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple building blocks."""

        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple building blocks."""

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer after the convolutional layers and before the feature maps are flattened."""

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2nd-5th convolutional layers of the ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), add the input to the output.
        2. If the input and output of the building block have different dimensions (including number of channels and size), add the input to the output after a convolutional layer that makes the dimensions match.

        **Args:**
        - **layer_name** (`str`): the name of this multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of the input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block, while the other building blocks keep an overall stride of 1. Inside that building block, the stride is performed at a certain convolutional layer, while the other convolutional layers keep a stride of 1:
            - For `ResNetBlockSmall`, it is performed at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it is performed at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, as in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """

        layer = []

        for block_idx in range(building_block_num):
            layer.append(
                building_block_type(
                    outer_layer_name=layer_name,
                    block_idx=block_idx,
                    preceding_output_channels=(
                        preceding_output_channels
                        if block_idx == 0
                        else (
                            input_channels
                            if building_block_type == ResNetBlockSmall
                            else input_channels * 4
                        )
                    ),  # the 1st block in this multi-building-block layer receives the number of channels of the preceding output of the entire layer. The other blocks receive the number of output channels of the preceding building block, which is 4 times the input channels for `ResNetBlockLarge` (no expansion for `ResNetBlockSmall`).
                    input_channels=input_channels,
                    overall_stride=(
                        overall_stride if block_idx == 0 else 1
                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
                    activation_layer=activation_layer,
                    batch_normalisation=batch_normalisation,
                    bias=bias,
                )
            )

            self.weighted_layer_names += layer[
                -1
            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync them to the weighted layer names list of the outer network

        return nn.Sequential(*layer)

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): the stage of the forward pass. Not used here; kept for a consistent interface across backbones. Default `None`.
        - **task_id** (`int` | `None`): the task ID. Not used here; kept for a consistent interface across backbones. Default `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into the heads. This is the main target of backpropagation.
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes.
        """
        batch_size = input.size(0)
        hidden_features = {}

        x = input

        x = self.conv1(x)
        if self.batch_normalisation:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv3x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv4x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv5x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through the heads

        return output_feature, hidden_features
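

# A sketch of the forward contract, using the `ResNet18` subclass defined below
# (the 3-channel 224x224 batch and the resulting shapes are assumed values):
#
#     import torch
#
#     backbone = ResNet18(input_channels=3, output_dim=512)
#     feature, hidden = backbone(torch.randn(4, 3, 224, 224))
#     assert feature.shape == (4, 512)  # flattened by the adaptive average pooling
#     sorted(hidden)[:3]
#     # ['conv1', 'conv2x/0/conv1', 'conv2x/0/conv2'] -- one entry per weighted layer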


class ResNet18(ResNetBase):
    r"""ResNet-18 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weighted layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-18 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, as in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
            building_block_nums=(2, 2, 2, 2),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )


class ResNet34(ResNetBase):
    r"""ResNet-34 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weighted layers in total. See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-34 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, as in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
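

# A sanity check on the layer counts of the small-block variants, counting weighted
# layers as the paper does (conv1, plus 2 convolutions per small block, plus the
# final classification layer, which lives in the CL output heads here):
#
#     ResNet-18: 1 + 2 * (2 + 2 + 2 + 2) + 1 = 18
#     ResNet-34: 1 + 2 * (3 + 4 + 6 + 3) + 1 = 34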
674 """ 675 676 def __init__( 677 self, 678 input_channels: int, 679 output_dim: int, 680 activation_layer: nn.Module | None = nn.ReLU, 681 batch_normalisation: bool = True, 682 bias: bool = False, 683 ) -> None: 684 r"""Construct and initialise the ResNet-50 backbone network. 685 686 **Args:** 687 - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension. 688 - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. 689 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 690 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 691 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 692 """ 693 ResNetBase.__init__( 694 self, 695 input_channels=input_channels, 696 building_block_type=ResNetBlockLarge, # use the larger building block for ResNet-50 697 building_block_nums=(3, 4, 6, 3), 698 building_block_preceding_output_channels=(64, 256, 512, 1024), 699 building_block_input_channels=(64, 128, 256, 512), 700 output_dim=output_dim, 701 activation_layer=activation_layer, 702 batch_normalisation=batch_normalisation, 703 bias=bias, 704 ) 705 706 707class ResNet101(ResNetBase): 708 r"""ResNet-101 backbone network. 709 710 This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details. 711 """ 712 713 def __init__( 714 self, 715 input_channels: int, 716 output_dim: int, 717 activation_layer: nn.Module | None = nn.ReLU, 718 batch_normalisation: bool = True, 719 bias: bool = False, 720 ) -> None: 721 r"""Construct and initialise the ResNet-101 backbone network. 722 723 **Args:** 724 - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension. 725 - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. 726 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 727 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 728 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 
729 """ 730 ResNetBase.__init__( 731 self, 732 input_channels=input_channels, 733 building_block_type=ResNetBlockLarge, # use the larger building block for ResNet-101 734 building_block_nums=(3, 4, 23, 3), 735 building_block_preceding_output_channels=(64, 256, 512, 1024), 736 building_block_input_channels=(64, 128, 256, 512), 737 output_dim=output_dim, 738 activation_layer=activation_layer, 739 batch_normalisation=batch_normalisation, 740 bias=bias, 741 ) 742 743 744class ResNet152(ResNetBase): 745 r"""ResNet-152 backbone network. 746 747 This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details. 748 """ 749 750 def __init__( 751 self, 752 input_channels: int, 753 output_dim: int, 754 activation_layer: nn.Module | None = nn.ReLU, 755 batch_normalisation: bool = True, 756 bias: bool = False, 757 ) -> None: 758 r"""Construct and initialise the ResNet-50 backbone network. 759 760 **Args:** 761 - **input_channels** (`int`): the number of channels of input of this building block. Note that convolutional networks require number of input channels instead of dimension. 762 - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. 763 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 764 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 765 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 766 """ 767 ResNetBase.__init__( 768 self, 769 input_channels=input_channels, 770 building_block_type=ResNetBlockLarge, # use the larger building block for ResNet-152 771 building_block_nums=(3, 8, 36, 3), 772 building_block_preceding_output_channels=(64, 256, 512, 1024), 773 building_block_input_channels=(64, 128, 256, 512), 774 output_dim=output_dim, 775 activation_layer=activation_layer, 776 batch_normalisation=batch_normalisation, 777 bias=bias, 778 ) 779 780 781class HATMaskResNetBlockSmall(HATMaskBackbone, ResNetBlockSmall): 782 r"""The smaller building block for HAT masked ResNet-18/34. 783 784 It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 785 786 Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function. 
787 """ 788 789 def __init__( 790 self, 791 outer_layer_name: str, 792 block_idx: int, 793 preceding_output_channels: int, 794 input_channels: int, 795 overall_stride: int, 796 gate: str, 797 activation_layer: nn.Module | None = nn.ReLU, 798 bias: bool = False, 799 ) -> None: 800 r"""Construct and initialise the smaller building block with task embedding. 801 802 **Args:** 803 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 804 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 805 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 806 - **input_channels** (`int`): the number of channels of input of this building block. 807 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1. 808 - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following: 809 - `sigmoid`: the sigmoid function. 810 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 811 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`. 812 """ 813 HATMaskBackbone.__init__(self, output_dim=None, gate=gate) 814 ResNetBlockSmall.__init__( 815 self, 816 outer_layer_name=outer_layer_name, 817 block_idx=block_idx, 818 preceding_output_channels=preceding_output_channels, 819 input_channels=input_channels, 820 overall_stride=overall_stride, 821 activation_layer=activation_layer, 822 bias=bias, 823 ) 824 self.register_hat_mask_module_explicitly(gate=gate) 825 826 # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise 827 layer_output_channels = ( 828 input_channels # the output channels of the 1st convolutional layer 829 ) 830 self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding( 831 num_embeddings=1, embedding_dim=layer_output_channels 832 ) 833 834 # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise 835 layer_output_channels = ( 836 input_channels * 1 837 ) # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 838 self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding( 839 num_embeddings=1, embedding_dim=layer_output_channels 840 ) 841 842 def forward( 843 self, 844 input: Tensor, 845 stage: str, 846 s_max: float | None = None, 847 batch_idx: int | None = None, 848 num_batches: int | None = None, 849 test_mask: dict[str, Tensor] | None = None, 850 ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]: 851 r"""The forward pass for data from task `task_id`. Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer. 852 853 **Args:** 854 - **input** (`Tensor`): The input tensor from data. 855 - **stage** (`str`): the stage of the forward pass, should be one of the following: 856 1. 'train': training stage. 857 2. 'validation': validation stage. 858 3. 'test': testing stage. 859 - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. 
See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a). 860 - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`. 861 - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`. 862 - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`. 863 864 **Returns:** 865 - **output_feature** (`Tensor`): the output feature maps. 866 - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units). 867 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class. 868 """ 869 hidden_features = {} 870 871 # get the mask for the current task from the task embedding in this stage 872 mask = self.get_mask( 873 stage=stage, 874 s_max=s_max, 875 batch_idx=batch_idx, 876 num_batches=num_batches, 877 test_mask=test_mask, 878 ) 879 880 identity = ( 881 self.identity_downsample(input) 882 if self.identity_downsample is not None 883 else input 884 ) # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's 885 886 x = input 887 x = self.conv1(x) # weighted convolutional layer first 888 x = x * ( 889 mask[self.full_1st_layer_name].view(1, -1, 1, 1) 890 ) # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input 891 if self.activation: 892 x = self.conv_activation1(x) # activation function third 893 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 894 895 x = self.conv2(x) # weighted convolutional layer first 896 x = x + identity 897 x = x * ( 898 mask[self.full_2nd_layer_name].view(1, -1, 1, 1) 899 ) # apply the mask to the 2nd convolutional layer after the shortcut connection. Broadcast the dimension of mask to match the input 900 if self.activation: 901 x = self.conv_activation2(x) # activation after the shortcut connection 902 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 903 904 output_feature = x 905 906 return output_feature, mask, hidden_features 907 908 909class HATMaskResNetBlockLarge(HATMaskBackbone, ResNetBlockLarge): 910 r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the ResNet paper. 911 912 It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 913 914 Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function. 
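

# A sketch of how the channel-wise HAT mask is applied (illustrative only; the
# tensor sizes are assumed values). Each mask entry gates one output channel, so a
# length-C mask vector is reshaped to broadcast over an (N, C, H, W) feature map:
#
#     import torch
#
#     feature = torch.randn(8, 64, 56, 56)
#     mask = torch.sigmoid(torch.randn(64))      # one gate value per channel
#     masked = feature * mask.view(1, -1, 1, 1)  # broadcast over batch and space
#     assert masked.shape == feature.shape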
915 """ 916 917 def __init__( 918 self, 919 outer_layer_name: str, 920 block_idx: int, 921 preceding_output_channels: int, 922 input_channels: int, 923 overall_stride: int, 924 gate: str, 925 activation_layer: nn.Module | None = nn.ReLU, 926 bias: bool = False, 927 ) -> None: 928 r"""Construct and initialise the larger building block with task embedding. 929 930 **Args:** 931 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 932 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 933 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 934 - **input_channels** (`int`): the number of channels of input of this building block. 935 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1. 936 - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following: 937 - `sigmoid`: the sigmoid function. 938 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 939 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`. 940 """ 941 HATMaskBackbone.__init__(self, output_dim=None, gate=gate) 942 ResNetBlockLarge.__init__( 943 self, 944 outer_layer_name=outer_layer_name, 945 block_idx=block_idx, 946 preceding_output_channels=preceding_output_channels, 947 input_channels=input_channels, 948 overall_stride=overall_stride, 949 activation_layer=activation_layer, 950 bias=bias, 951 ) 952 self.register_hat_mask_module_explicitly(gate=gate) 953 954 # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise 955 layer_output_channels = ( 956 input_channels # the output channels of the 1st convolutional layer 957 ) 958 self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding( 959 num_embeddings=1, embedding_dim=layer_output_channels 960 ) 961 962 # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise 963 layer_output_channels = ( 964 input_channels * 1 965 ) # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 966 self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding( 967 num_embeddings=1, embedding_dim=layer_output_channels 968 ) 969 970 # construct the task embedding over the 3rd weighted convolutional layer. It is channel-wise 971 layer_output_channels = ( 972 input_channels 973 * 4 # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels 974 ) 975 self.task_embedding_t[self.full_3rd_layer_name] = nn.Embedding( 976 num_embeddings=1, embedding_dim=layer_output_channels 977 ) 978 979 def forward( 980 self, 981 input: Tensor, 982 stage: str, 983 s_max: float | None = None, 984 batch_idx: int | None = None, 985 num_batches: int | None = None, 986 test_mask: dict[str, Tensor] | None = None, 987 ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]: 988 r"""The forward pass for data from task `task_id`. 
Task-specific mask for `task_id` are applied to the units which are channels in each weighted convolutional layer. 989 990 **Args:** 991 - **input** (`Tensor`): The input tensor from data. 992 - **stage** (`str`): the stage of the forward pass, should be one of the following: 993 1. 'train': training stage. 994 2. 'validation': validation stage. 995 3. 'test': testing stage. 996 - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to testing stage. See chapter 2.4 "Hard Attention Training" in [HAT paper](http://proceedings.mlr.press/v80/serra18a). 997 - **batch_idx** (`int` | `None`): the current batch index. Applies only to training stage. For other stages, it is default `None`. 998 - **num_batches** (`int` | `None`): the total number of batches. Applies only to training stage. For other stages, it is default `None`. 999 - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for test. Applies only to testing stage. For other stages, it is default `None`. 1000 1001 **Returns:** 1002 - **output_feature** (`Tensor`): the output feature maps. 1003 - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units). 1004 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. Although HAT algorithm does not need this, it is still provided for API consistence for other HAT-based algorithms inherited this `forward()` method of `HAT` class. 1005 """ 1006 hidden_features = {} 1007 1008 # get the mask for the current task from the task embedding in this stage 1009 mask = self.get_mask( 1010 stage=stage, 1011 s_max=s_max, 1012 batch_idx=batch_idx, 1013 num_batches=num_batches, 1014 test_mask=test_mask, 1015 ) 1016 1017 identity = ( 1018 self.identity_downsample(input) 1019 if self.identity_downsample is not None 1020 else input 1021 ) # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's 1022 1023 x = input 1024 x = self.conv1(x) # weighted convolutional layer first 1025 x = x * ( 1026 mask[self.full_1st_layer_name].view(1, -1, 1, 1) 1027 ) # apply the mask to the 1st convolutional layer. Broadcast the dimension of mask to match the input 1028 if self.activation: 1029 x = self.conv_activation1(x) # activation function third 1030 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 1031 1032 x = self.conv2(x) # weighted convolutional layer first 1033 x = x * ( 1034 mask[self.full_2nd_layer_name].view(1, -1, 1, 1) 1035 ) # apply the mask to the 2nd convolutional layer. Broadcast the dimension of mask to match the input 1036 if self.activation: 1037 x = self.conv_activation2(x) # activation function third 1038 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 1039 1040 x = self.conv3(x) # weighted convolutional layer first 1041 x = x + identity 1042 x = x * ( 1043 mask[self.full_3rd_layer_name].view(1, -1, 1, 1) 1044 ) # apply the mask to the 3rd convolutional layer after the shortcut connection. 
class HATMaskResNetBase(ResNetBase, HATMaskBackbone):
    r"""The base class of HAT masked [residual networks (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet is a convolutional network architecture. It starts with a 1st convolutional parameter layer and a max-pooling layer, followed by 4 multi-building-block layers, each containing multiple convolutional parameter layers. Each of these 4 layers is constructed from basic building blocks, which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`), and each building block contains several convolutional parameter layers. Every building block carries a skip connection from its input directly to its output, which is why the network is called residual (see "shortcut connections" in the paper for details). After the 5th convolutional layer, an average pooling layer and a fully connected layer connect to the CL output heads.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Image data keep their channel dimension when going into ResNet. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2nd-5th convolutional layers respectively.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of the preceding output of each building block in the 2nd-5th convolutional layers respectively.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of the input of each building block in the 2nd-5th convolutional layers respectively.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        # init from both inherited classes
        HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate)
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=building_block_type,
            building_block_nums=building_block_nums,
            building_block_preceding_output_channels=building_block_preceding_output_channels,
            building_block_input_channels=building_block_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=False,  # batch normalisation is incompatible with HAT mechanism
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(
            gate=gate
        )  # register all `nn.Module`s of HATMaskBackbone explicitly, because the second `__init__()` wipes out the ones initialised by the first `__init__()`
        self.update_multiple_blocks_task_embedding()

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.task_embedding_t["conv1"] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = False,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks with task embedding. It's used to construct the 2nd-5th convolutional layers of the HAT masked ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), the input is added to the output directly.
        2. If the input and output of the building block have different dimensions (in number of channels or size), the input is added to the output after a convolutional layer that makes the dimensions match.

        **Args:**
        - **layer_name** (`str`): pass the name of this multi-building-block layer to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of the input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block, while the other building blocks keep an overall stride of 1. Inside that building block, the stride is performed at a certain convolutional layer, while the other convolutional layers keep stride of 1:
            - For `ResNetBlockSmall`, it is performed at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it is performed at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. In HATMaskResNet, batch normalisation is incompatible with the HAT mechanism and should always be set `False`. We include this argument for compatibility with the original ResNet API.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """

        layer = []

        for block_idx in range(building_block_num):
            layer.append(
                building_block_type(
                    outer_layer_name=layer_name,
                    block_idx=block_idx,
                    preceding_output_channels=(
                        preceding_output_channels
                        if block_idx == 0
                        else (
                            input_channels
                            if building_block_type == HATMaskResNetBlockSmall
                            else input_channels * 4
                        )
                    ),  # for the 1st block in this multi-building-block layer, it is the number of channels of the preceding output of the entire layer. Otherwise, it is the number of output channels of the previous building block: the same as the input channels for `HATMaskResNetBlockSmall`, and 4 times the input channels for `HATMaskResNetBlockLarge`
                    input_channels=input_channels,
                    overall_stride=(
                        overall_stride if block_idx == 0 else 1
                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
                    gate=self.gate,
                    # no batch normalisation in HAT masked blocks
                    activation_layer=activation_layer,
                    bias=bias,
                )
            )

            self.weighted_layer_names += layer[
                -1
            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync them to the weighted layer names list of the outer network

        return nn.Sequential(*layer)

    def update_multiple_blocks_task_embedding(self) -> None:
        r"""Collect the task embeddings in the multiple building blocks (the 2nd-5th convolutional layers) and sync them to the task embedding dict of the outer network.

        This should be called explicitly after the `__init__()` method, because the task embeddings, as `nn.Module` instances, were wiped out at the beginning of it.
        """
        for block in self.conv2x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv3x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv4x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv5x:
            self.task_embedding_t.update(block.task_embedding_t)

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. The task-specific mask for `task_id` is applied to the units, which are the channels in each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to the testing stage. See section 2.4 "Hard Attention Training" in the [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to the training stage. For other stages, it defaults to `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to the training stage. For other stages, it defaults to `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for testing. Applies only to the testing stage. For other stages, it defaults to `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is the layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm itself does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        batch_size = input.size(0)
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        x = input

        x = self.conv1(x)
        x = x * (
            mask["conv1"].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer. Broadcast the mask over the batch and spatial dimensions
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv3x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv4x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv5x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through heads

        return output_feature, mask, hidden_features
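One subtlety of the double inheritance above: `ResNetBase.__init__()` eventually calls `nn.Module.__init__()`, which resets the module's internal registries, so every submodule registered by the earlier `HATMaskBackbone.__init__()` (including the task embeddings) is lost. This is why `register_hat_mask_module_explicitly()` and `update_multiple_blocks_task_embedding()` are called again afterwards. A minimal sketch of the pitfall, using hypothetical classes unrelated to this module:

from torch import nn

class Left(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        self.embedding = nn.Embedding(1, 64)  # registered in self._modules

class Right(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)  # resets self._modules, self._parameters, ...

class Both(Left, Right):
    def __init__(self):
        Left.__init__(self)
        Right.__init__(self)  # wipes out the embedding registered by `Left.__init__()`
        self.embedding = nn.Embedding(1, 64)  # so it must be registered again explicitly

assert "embedding" in dict(Both().named_modules())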
class HATMaskResNet18(HATMaskResNetBase):
    r"""HAT masked ResNet-18 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-18 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weight convolutional layers in total. See Table 1 in the paper for details.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-18
            building_block_nums=(2, 2, 2, 2),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )


class HATMaskResNet34(HATMaskResNetBase):
    r"""HAT masked ResNet-34 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-34 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weight convolutional layers in total. See Table 1 in the paper for details.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-34
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )


class HATMaskResNet50(HATMaskResNetBase):
    r"""HAT masked ResNet-50 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-50 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weight convolutional layers in total. See Table 1 in the paper for details.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-50
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
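For orientation, a usage sketch of the small-block variant. This is illustrative only: in practice these calls are made by the HAT algorithm in `clarena` rather than by hand, the flattened feature dimension of 512 assumes the standard ResNet-18 layout, and the 0.5-thresholded test mask is an assumption rather than the library's exact binarisation:

import torch

backbone = HATMaskResNet18(input_channels=3, output_dim=512, gate="sigmoid")
images = torch.randn(8, 3, 224, 224)

# training stage: the mask is annealed across batches via `s_max`, `batch_idx` and `num_batches`
feature, mask, hidden_features = backbone(
    images, stage="train", s_max=400.0, batch_idx=0, num_batches=100
)

# testing stage: a stored mask of a previous task is applied instead
binary_mask = {name: (m > 0.5).float() for name, m in mask.items()}
feature_test, _, _ = backbone(images, stage="test", test_mask=binary_mask)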
class HATMaskResNet101(HATMaskResNetBase):
    r"""HAT masked ResNet-101 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-101 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weight convolutional layers in total. See Table 1 in the paper for details.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-101
            building_block_nums=(3, 4, 23, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )


class HATMaskResNet152(HATMaskResNetBase):
    r"""HAT masked ResNet-152 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-152 is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weight convolutional layers in total. See Table 1 in the paper for details.

    The masks are applied to the units, which are the output channels of each weighted convolutional layer. Each mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-152
            building_block_nums=(3, 8, 36, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
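The constructor tuples of the large-block variants follow from the 4-times channel expansion of the bottleneck block: each of the 2nd-5th layers outputs `4 * input_channels`, which becomes the preceding output channels of the next layer. A quick check of that arithmetic in plain Python, matching the arguments passed above:

building_block_input_channels = (64, 128, 256, 512)  # shared by ResNet-50/101/152
expansion = 4  # the large ("bottleneck") block expands channels 4 times at its 3rd layer

preceding = [64]  # the 1st convolutional layer outputs 64 channels
for in_channels in building_block_input_channels[:-1]:
    preceding.append(in_channels * expansion)

assert tuple(preceding) == (64, 256, 512, 1024)  # building_block_preceding_output_channels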
31class ResNetBlockSmall(CLBackbone): 32 r"""The smaller building block for ResNet-18/34. 33 34 It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 35 """ 36 37 def __init__( 38 self, 39 outer_layer_name: str, 40 block_idx: int, 41 preceding_output_channels: int, 42 input_channels: int, 43 overall_stride: int, 44 activation_layer: nn.Module | None = nn.ReLU, 45 batch_normalisation: bool = True, 46 bias: bool = False, 47 ) -> None: 48 r"""Construct and initialise the smaller building block. 49 50 **Args:** 51 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 52 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 53 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 54 - **input_channels** (`int`): the number of channels of input of this building block. 55 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1. 56 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 57 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 58 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 59 """ 60 CLBackbone.__init__(self, output_dim=None) 61 62 self.batch_normalisation: bool = batch_normalisation 63 r"""Store whether to use batch normalisation after the fully-connected layers.""" 64 self.activation: bool = activation_layer is not None 65 r"""Store whether to use activation function after the fully-connected layers.""" 66 67 self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1" 68 r"""Format and store full name of the 1st weighted convolutional layer. """ 69 self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2" 70 r"""Format and store full name of the 2nd weighted convolutional layer. """ 71 72 # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc) 73 layer_input_channels = preceding_output_channels # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module 74 layer_output_channels = ( 75 input_channels # the output channels of the 1st convolutional layer 76 ) 77 self.conv1 = nn.Conv2d( 78 in_channels=layer_input_channels, 79 out_channels=layer_output_channels, 80 kernel_size=3, 81 stride=1, 82 padding=1, 83 bias=bias, 84 ) # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here 85 r"""The 1st weight convolutional layer of the smaller building block. 
""" 86 self.weighted_layer_names.append( 87 self.full_1st_layer_name 88 ) # update the weighted layer names 89 if self.batch_normalisation: 90 self.conv_bn1 = nn.BatchNorm2d( 91 num_features=layer_output_channels 92 ) # construct the batch normalisation layer 93 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """ 94 if self.activation: 95 self.conv_activation1 = activation_layer() # construct the activation layer 96 r"""The activation layer after the 1st weighted convolutional layer. """ 97 98 # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc) 99 layer_input_channels = input_channels # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer 100 layer_output_channels = ( 101 input_channels * 1 102 ) # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 103 self.conv2 = nn.Conv2d( 104 in_channels=layer_input_channels, 105 out_channels=layer_output_channels, 106 kernel_size=3, 107 stride=overall_stride, 108 padding=1, 109 bias=bias, 110 ) # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here 111 r"""The 2nd weight convolutional layer of the smaller building block. """ 112 self.weighted_layer_names.append( 113 self.full_2nd_layer_name 114 ) # update the weighted layer names 115 if batch_normalisation: 116 self.conv_bn2 = nn.BatchNorm2d( 117 num_features=layer_output_channels 118 ) # construct the batch normalisation layer 119 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """ 120 if self.activation: 121 self.conv_activation2 = activation_layer() # construct the activation layer 122 r"""The activation layer after the 2nd weighted convolutional layer. """ 123 124 self.identity_downsample: nn.Module = ( 125 nn.Conv2d( 126 in_channels=preceding_output_channels, 127 out_channels=input_channels, 128 kernel_size=1, 129 stride=overall_stride, 130 bias=False, 131 ) 132 if preceding_output_channels != input_channels or overall_stride != 1 133 else None 134 ) # construct the identity downsample function 135 r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """ 136 137 def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]: 138 r"""The forward pass for data. 139 140 **Args:** 141 - **input** (`Tensor`): the input feature maps. 142 143 **Returns:** 144 - **output_feature** (`Tensor`): the output feature maps. 145 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. 146 """ 147 hidden_features = {} 148 149 identity = ( 150 self.identity_downsample(input) 151 if self.identity_downsample is not None 152 else input 153 ) # remember the identity of input for the shortcut connection. 
Perform downsampling if its dimension doesn't match the output's 154 155 x = input 156 x = self.conv1(x) 157 if self.batch_normalisation: 158 x = self.conv_bn1(x) 159 if self.activation: 160 x = self.conv_activation1(x) 161 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 162 163 x = self.conv2(x) 164 if self.batch_normalisation: 165 x = self.conv_bn2(x) 166 167 x = x + identity 168 if self.activation: 169 x = self.conv_activation2(x) # activation after the shortcut connection 170 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 171 172 output_feature = x 173 174 return output_feature, hidden_features
The smaller building block for ResNet-18/34.
It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the original ResNet paper.
37 def __init__( 38 self, 39 outer_layer_name: str, 40 block_idx: int, 41 preceding_output_channels: int, 42 input_channels: int, 43 overall_stride: int, 44 activation_layer: nn.Module | None = nn.ReLU, 45 batch_normalisation: bool = True, 46 bias: bool = False, 47 ) -> None: 48 r"""Construct and initialise the smaller building block. 49 50 **Args:** 51 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 52 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 53 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 54 - **input_channels** (`int`): the number of channels of input of this building block. 55 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1. 56 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 57 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 58 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 59 """ 60 CLBackbone.__init__(self, output_dim=None) 61 62 self.batch_normalisation: bool = batch_normalisation 63 r"""Store whether to use batch normalisation after the fully-connected layers.""" 64 self.activation: bool = activation_layer is not None 65 r"""Store whether to use activation function after the fully-connected layers.""" 66 67 self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1" 68 r"""Format and store full name of the 1st weighted convolutional layer. """ 69 self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2" 70 r"""Format and store full name of the 2nd weighted convolutional layer. """ 71 72 # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc) 73 layer_input_channels = preceding_output_channels # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module 74 layer_output_channels = ( 75 input_channels # the output channels of the 1st convolutional layer 76 ) 77 self.conv1 = nn.Conv2d( 78 in_channels=layer_input_channels, 79 out_channels=layer_output_channels, 80 kernel_size=3, 81 stride=1, 82 padding=1, 83 bias=bias, 84 ) # construct the 1st weight convolutional layer of the smaller building block. Overall stride is not performed here 85 r"""The 1st weight convolutional layer of the smaller building block. """ 86 self.weighted_layer_names.append( 87 self.full_1st_layer_name 88 ) # update the weighted layer names 89 if self.batch_normalisation: 90 self.conv_bn1 = nn.BatchNorm2d( 91 num_features=layer_output_channels 92 ) # construct the batch normalisation layer 93 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. 
""" 94 if self.activation: 95 self.conv_activation1 = activation_layer() # construct the activation layer 96 r"""The activation layer after the 1st weighted convolutional layer. """ 97 98 # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc) 99 layer_input_channels = input_channels # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer 100 layer_output_channels = ( 101 input_channels * 1 102 ) # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 103 self.conv2 = nn.Conv2d( 104 in_channels=layer_input_channels, 105 out_channels=layer_output_channels, 106 kernel_size=3, 107 stride=overall_stride, 108 padding=1, 109 bias=bias, 110 ) # construct the 2nd weight convolutional layer of the smaller building block. Overall stride is performed here 111 r"""The 2nd weight convolutional layer of the smaller building block. """ 112 self.weighted_layer_names.append( 113 self.full_2nd_layer_name 114 ) # update the weighted layer names 115 if batch_normalisation: 116 self.conv_bn2 = nn.BatchNorm2d( 117 num_features=layer_output_channels 118 ) # construct the batch normalisation layer 119 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """ 120 if self.activation: 121 self.conv_activation2 = activation_layer() # construct the activation layer 122 r"""The activation layer after the 2nd weighted convolutional layer. """ 123 124 self.identity_downsample: nn.Module = ( 125 nn.Conv2d( 126 in_channels=preceding_output_channels, 127 out_channels=input_channels, 128 kernel_size=1, 129 stride=overall_stride, 130 bias=False, 131 ) 132 if preceding_output_channels != input_channels or overall_stride != 1 133 else None 134 ) # construct the identity downsample function 135 r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """
Construct and initialise the smaller building block.
Args:
- outer_layer_name (
str
): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. - block_idx (
int
): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. - preceding_output_channels (
int
): the number of channels of preceding output of this particular building block. - input_channels (
int
): the number of channels of input of this building block. - overall_stride (
int
): the overall stride of this building block. This stride is performed at 2nd (last) convolutional layer where the 1st convolutional layer remain stride of 1. - activation_layer (
nn.Module
): activation function of each layer (if notNone
), ifNone
this layer won't be used. Defaultnn.ReLU
. - batch_normalisation (
bool
): whether to use batch normalisation after the weight convolutional layers. DefaultTrue
, same as what the original ResNet paper does. - bias (
bool
): whether to use bias in the convolutional layer. DefaultFalse
, because batch normalisation are doing the similar thing with bias.
Store whether to use batch normalisation after the fully-connected layers.
The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists.
137 def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]: 138 r"""The forward pass for data. 139 140 **Args:** 141 - **input** (`Tensor`): the input feature maps. 142 143 **Returns:** 144 - **output_feature** (`Tensor`): the output feature maps. 145 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. 146 """ 147 hidden_features = {} 148 149 identity = ( 150 self.identity_downsample(input) 151 if self.identity_downsample is not None 152 else input 153 ) # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's 154 155 x = input 156 x = self.conv1(x) 157 if self.batch_normalisation: 158 x = self.conv_bn1(x) 159 if self.activation: 160 x = self.conv_activation1(x) 161 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 162 163 x = self.conv2(x) 164 if self.batch_normalisation: 165 x = self.conv_bn2(x) 166 167 x = x + identity 168 if self.activation: 169 x = self.conv_activation2(x) # activation after the shortcut connection 170 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 171 172 output_feature = x 173 174 return output_feature, hidden_features
The forward pass for data.
Args:
- input (
Tensor
): the input feature maps.
Returns:
- output_feature (
Tensor
): the output feature maps. - hidden_features (
dict[str, Tensor]
): the hidden features (after activation) in each weighted layer. Key (str
) is the weighted layer name, value (Tensor
) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes.
177class ResNetBlockLarge(CLBackbone): 178 r"""The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the paper. 179 180 It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). 181 """ 182 183 def __init__( 184 self, 185 outer_layer_name: str, 186 block_idx: int, 187 preceding_output_channels: int, 188 input_channels: int, 189 overall_stride: int, 190 activation_layer: nn.Module | None = nn.ReLU, 191 batch_normalisation: bool = True, 192 bias: bool = False, 193 ) -> None: 194 r"""Construct and initialise the larger building block. 195 196 **Args:** 197 - **outer_layer_name** (`str`): pass the name of the multi-building-block layer that contains this building block to construct the full name of each weighted convolutional layer. 198 - **block_idx** (`int`): the index of the building blocks in the multi-building-block layer to construct the full name of each weighted convolutional layer. 199 - **preceding_output_channels** (`int`): the number of channels of preceding output of this particular building block. 200 - **input_channels** (`int`): the number of channels of input of this building block. 201 - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at 2nd (middle) convolutional layer where 1st and 3rd convolutional layers remain stride of 1. 202 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 203 - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does. 204 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 205 """ 206 CLBackbone.__init__(self, output_dim=None) 207 208 self.batch_normalisation: bool = batch_normalisation 209 r"""Store whether to use batch normalisation after the fully-connected layers.""" 210 self.activation: bool = activation_layer is not None 211 r"""Store whether to use activation function after the fully-connected layers.""" 212 213 self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1" 214 r"""Format and store full name of the 1st weighted convolutional layer. """ 215 self.full_2nd_layer_name = f"{outer_layer_name}_{block_idx}_conv2" 216 r"""Format and store full name of the 2nd weighted convolutional layer. """ 217 self.full_3rd_layer_name = f"{outer_layer_name}_{block_idx}_conv3" 218 r"""Format and store full name of the 3rd weighted convolutional layer. 
""" 219 220 # construct the 1st weighted convolutional layer and attached layers (batchnorm, activation, etc) 221 layer_input_channels = preceding_output_channels # the input channels of the 1st convolutional layer, which receive the output channels of the preceding module 222 layer_output_channels = ( 223 input_channels # the output channels of the 1st convolutional layer 224 ) 225 self.conv1 = nn.Conv2d( 226 in_channels=layer_input_channels, 227 out_channels=layer_output_channels, 228 kernel_size=1, 229 stride=1, 230 padding=0, 231 bias=bias, 232 ) # construct the 1st weight convolutional layer of the larger building block. Overall stride is not performed here 233 r"""The 1st weight convolutional layer of the larger building block. """ 234 self.weighted_layer_names.append( 235 self.full_1st_layer_name 236 ) # update the weighted layer names 237 if self.batch_normalisation: 238 self.conv_bn1 = nn.BatchNorm2d( 239 num_features=layer_output_channels 240 ) # construct the batch normalisation layer 241 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer. """ 242 if self.activation: 243 self.conv_activation1 = activation_layer() # construct the activation layer 244 r"""The activation layer after the 1st weighted convolutional layer. """ 245 246 # construct the 2nd weighted convolutional layer and attached layers (batchnorm, activation, etc) 247 layer_input_channels = input_channels # the input channels of the 2nd convolutional layer, which is `input_channels`, the same as the output channels of the 1st convolutional layer 248 layer_output_channels = ( 249 input_channels 250 * 1 # the output channels of the 2nd convolutional layer, which is the same as the input channels (without expansion) 251 ) 252 self.conv2 = nn.Conv2d( 253 in_channels=layer_input_channels, 254 out_channels=layer_output_channels, 255 kernel_size=3, 256 stride=overall_stride, 257 padding=1, 258 bias=bias, 259 ) # construct the 2nd weight convolutional layer of the larger building block. Overall stride is performed here 260 r"""The 2nd weight convolutional layer of the larger building block. """ 261 self.weighted_layer_names.append( 262 self.full_2nd_layer_name 263 ) # update the weighted layer names 264 if self.batch_normalisation: 265 self.conv_bn2 = nn.BatchNorm2d( 266 num_features=layer_output_channels 267 ) # construct the batch normalisation layer 268 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer. """ 269 if self.activation: 270 self.conv_activation2 = activation_layer() # construct the activation layer 271 r"""The activation layer after the 2nd weighted convolutional layer. """ 272 273 # construct the 3rd weighted convolutional layer and attached layers (batchnorm, activation, etc) 274 layer_input_channels = ( 275 input_channels * 1 276 ) # the input channels of the 2nd convolutional layer, which is `input_channels * 1`, the same as the output channels of the 1st convolutional layer 277 layer_output_channels = ( 278 input_channels 279 * 4 # the output channels of the 2nd convolutional layer, which is 4 times expanded as the input channels 280 ) 281 self.conv3 = nn.Conv2d( 282 in_channels=layer_input_channels, 283 out_channels=layer_output_channels, 284 kernel_size=1, 285 stride=1, 286 padding=0, 287 bias=bias, 288 ) # construct the 3rd weight convolutional layer of the larger building block. Overall stride is not performed here 289 r"""The 3rd weight convolutional layer of the larger building block. 
""" 290 self.weighted_layer_names.append( 291 self.full_3rd_layer_name 292 ) # update the weighted layer names 293 if batch_normalisation: 294 self.conv_bn3 = nn.BatchNorm2d( 295 num_features=layer_output_channels 296 ) # construct the batch normalisation layer 297 r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer. """ 298 if self.activation: 299 self.conv_activation3 = activation_layer() # construct the activation layer 300 r"""The activation layer after the 3rd weighted convolutional layer. """ 301 302 self.identity_downsample: nn.Module = ( 303 nn.Conv2d( 304 in_channels=preceding_output_channels, 305 out_channels=input_channels * 4, 306 kernel_size=1, 307 stride=overall_stride, 308 bias=False, 309 ) 310 if preceding_output_channels != input_channels * 4 or overall_stride != 1 311 else None 312 ) 313 r"""The convolutional layer for downsampling identity in the shortcut connection if the dimension of identity from input doesn't match the output's. This case only happens when the number of input channels doesn't equal to the number of preceding output channels or a layer with stride > 1 exists. """ 314 315 def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]: 316 r"""The forward pass for data. 317 318 **Args:** 319 - **input** (`Tensor`): the input feature maps. 320 321 **Returns:** 322 - **output_feature** (`Tensor`): the output feature maps. 323 - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need to use the hidden features for various purposes. 324 """ 325 hidden_features = {} 326 327 identity = ( 328 self.identity_downsample(input) 329 if self.identity_downsample is not None 330 else input 331 ) # remember the identity of input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's 332 333 x = input 334 x = self.conv1(x) 335 if self.batch_normalisation: 336 x = self.conv_bn1(x) 337 if self.activation: 338 x = self.conv_activation1(x) 339 hidden_features[self.full_1st_layer_name] = x # store the hidden feature 340 341 x = self.conv2(x) 342 if self.batch_normalisation: 343 x = self.conv_bn2(x) 344 if self.activation: 345 x = self.conv_activation2(x) 346 hidden_features[self.full_2nd_layer_name] = x # store the hidden feature 347 348 x = self.conv3(x) 349 if self.batch_normalisation: 350 x = self.conv_bn3(x) 351 352 x = x + identity 353 if self.activation: 354 x = self.conv_activation3(x) # activation after the shortcut connection 355 hidden_features[self.full_3rd_layer_name] = x # store the hidden feature 356 357 output_feature = x 358 359 return output_feature, hidden_features
The larger building block for ResNet-50/101/152. It is referred to "bottleneck" building block in the paper.
It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the original ResNet paper.
    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the larger building block.

        **Args:**
        - **outer_layer_name** (`str`): the name of the multi-building-block layer that contains this building block, used to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of this building block within the multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of output channels of the module preceding this building block.
        - **input_channels** (`int`): the number of input channels of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at the 2nd (middle) convolutional layer, while the 1st and 3rd convolutional layers keep stride 1.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        CLBackbone.__init__(self, output_dim=None)

        self.batch_normalisation: bool = batch_normalisation
        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Store whether to use an activation function after the weighted convolutional layers."""

        self.full_1st_layer_name = f"{outer_layer_name}/{block_idx}/conv1"
        r"""Format and store the full name of the 1st weighted convolutional layer."""
        self.full_2nd_layer_name = f"{outer_layer_name}/{block_idx}/conv2"
        r"""Format and store the full name of the 2nd weighted convolutional layer."""
        self.full_3rd_layer_name = f"{outer_layer_name}/{block_idx}/conv3"
        r"""Format and store the full name of the 3rd weighted convolutional layer."""

        # construct the 1st weighted convolutional layer and its attached layers (batchnorm, activation, etc.)
        layer_input_channels = preceding_output_channels  # the 1st convolutional layer receives the output channels of the preceding module
        layer_output_channels = input_channels  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )  # the overall stride is not performed here
        r"""The 1st weighted convolutional layer of the larger building block."""
        self.weighted_layer_names.append(self.full_1st_layer_name)  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn1 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer."""
        if self.activation:
            self.conv_activation1 = activation_layer()
            r"""The activation layer after the 1st weighted convolutional layer."""

        # construct the 2nd weighted convolutional layer and its attached layers (batchnorm, activation, etc.)
        layer_input_channels = input_channels  # the input channels of the 2nd convolutional layer, the same as the output channels of the 1st convolutional layer
        layer_output_channels = input_channels * 1  # the output channels of the 2nd convolutional layer, the same as its input channels (no expansion)
        self.conv2 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=3,
            stride=overall_stride,
            padding=1,
            bias=bias,
        )  # the overall stride is performed here
        r"""The 2nd weighted convolutional layer of the larger building block."""
        self.weighted_layer_names.append(self.full_2nd_layer_name)  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn2 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 2nd weighted convolutional layer."""
        if self.activation:
            self.conv_activation2 = activation_layer()
            r"""The activation layer after the 2nd weighted convolutional layer."""

        # construct the 3rd weighted convolutional layer and its attached layers (batchnorm, activation, etc.)
        layer_input_channels = input_channels * 1  # the input channels of the 3rd convolutional layer, the same as the output channels of the 2nd convolutional layer
        layer_output_channels = input_channels * 4  # the output channels of the 3rd convolutional layer, expanded 4 times from the input channels
        self.conv3 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )  # the overall stride is not performed here
        r"""The 3rd weighted convolutional layer of the larger building block."""
        self.weighted_layer_names.append(self.full_3rd_layer_name)  # update the weighted layer names
        if self.batch_normalisation:
            self.conv_bn3 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 3rd weighted convolutional layer."""
        if self.activation:
            self.conv_activation3 = activation_layer()
            r"""The activation layer after the 3rd weighted convolutional layer."""

        self.identity_downsample: nn.Module | None = (
            nn.Conv2d(
                in_channels=preceding_output_channels,
                out_channels=input_channels * 4,
                kernel_size=1,
                stride=overall_stride,
                bias=False,
            )
            if preceding_output_channels != input_channels * 4 or overall_stride != 1
            else None
        )  # construct the identity downsample function
        r"""The convolutional layer for downsampling the identity in the shortcut connection when the dimensions of the identity don't match the output's. This happens only when the number of preceding output channels doesn't equal the number of output channels (`input_channels * 4`) or the overall stride is larger than 1."""
    def forward(self, input: Tensor) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data.

        **Args:**
        - **input** (`Tensor`): the input feature maps.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need the hidden features for various purposes.
        """
        hidden_features = {}

        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )  # remember the identity of the input for the shortcut connection. Perform downsampling if its dimensions don't match the output's

        x = input
        x = self.conv1(x)
        if self.batch_normalisation:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)
        if self.batch_normalisation:
            x = self.conv_bn2(x)
        if self.activation:
            x = self.conv_activation2(x)
        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature

        x = self.conv3(x)
        if self.batch_normalisation:
            x = self.conv_bn3(x)

        x = x + identity  # the shortcut connection
        if self.activation:
            x = self.conv_activation3(x)  # activation after the shortcut connection
        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, hidden_features
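For orientation, here is a minimal usage sketch of this block (an illustration, not part of the module; it assumes `clarena` is installed and the class is importable from this submodule). It builds a hypothetical first bottleneck block of a `conv3x` stage, showing the 4-times channel expansion and the strided downsampling:

import torch

from clarena.backbones.resnet import ResNetBlockLarge

# First large block of a conv3x stage: the preceding stage outputs
# 256 channels (64 * 4), this block works on 128 channels internally
# and expands to 128 * 4 = 512 output channels at stride 2.
block = ResNetBlockLarge(
    outer_layer_name="conv3x",
    block_idx=0,
    preceding_output_channels=256,
    input_channels=128,
    overall_stride=2,
)

x = torch.randn(2, 256, 56, 56)  # (batch, channels, height, width)
output_feature, hidden_features = block(x)

print(output_feature.shape)  # torch.Size([2, 512, 28, 28])
print(sorted(hidden_features))  # ['conv3x/0/conv1', 'conv3x/0/conv2', 'conv3x/0/conv3']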
class ResNetBase(CLBackbone):
    r"""The base class of [residual networks (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    ResNet is a convolutional network architecture. It starts with a 1st convolutional parameter layer and a max pooling layer, which connect to 4 further convolutional layers, each containing multiple convolutional parameter layers. Each of these 4 layers is constructed from basic building blocks, either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`), and each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection, a direct connection from the input of the block to its output, which is why the architecture is called residual (see "shortcut connections" in the paper for details). After the 5th convolutional layer, an average pooling layer is applied and the flattened features are passed to the CL output heads.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels. Image data keep their channels when going through ResNet. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2nd-5th convolutional layers respectively.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of the preceding output of each building block in the 2nd-5th convolutional layers respectively.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of input channels of each building block in the 2nd-5th convolutional layers respectively.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        CLBackbone.__init__(self, output_dim=output_dim)

        self.batch_normalisation: bool = batch_normalisation
        r"""Store whether to use batch normalisation after the weighted convolutional layers."""
        self.activation: bool = activation_layer is not None
        r"""Store whether to use an activation function after the weighted convolutional layers."""

        # construct the 1st weighted convolutional layer and its attached layers (batchnorm, activation, etc.)
        layer_input_channels = input_channels  # the 1st convolutional layer receives the input of the entire network
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.conv1 = nn.Conv2d(
            in_channels=layer_input_channels,
            out_channels=layer_output_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=bias,
        )  # construct the 1st weighted convolutional layer of the entire ResNet
        r"""The 1st weighted convolutional layer of the entire ResNet, always with fixed kernel size 7x7, stride 2 and padding 3."""
        self.weighted_layer_names.append("conv1")  # collect the layer name to be masked
        if self.batch_normalisation:
            self.conv_bn1 = nn.BatchNorm2d(num_features=layer_output_channels)
            r"""The batch normalisation (`nn.BatchNorm2d`) layer after the 1st weighted convolutional layer."""
        if self.activation:
            self.conv_activation1 = activation_layer()
            r"""The activation layer after the 1st weighted convolutional layer."""

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        r"""The max pooling layer (kernel size 3x3, stride 2) between the 1st and 2nd convolutional layers."""

        # construct the 2nd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv2x = self._multiple_blocks(
            layer_name="conv2x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[0],
            preceding_output_channels=building_block_preceding_output_channels[0],
            input_channels=building_block_input_channels[0],
            overall_stride=1,  # the overall stride of the 2nd convolutional layer should be 1, as the preceding max pooling layer has stride 2, which already made 112x112 -> 56x56. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 2nd convolutional layer of the ResNet, which contains multiple blocks."""

        # construct the 3rd convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv3x = self._multiple_blocks(
            layer_name="conv3x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[1],
            preceding_output_channels=building_block_preceding_output_channels[1],
            input_channels=building_block_input_channels[1],
            overall_stride=2,  # the overall stride of the 3rd convolutional layer should be 2, making 56x56 -> 28x28. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 3rd convolutional layer of the ResNet, which contains multiple blocks."""

        # construct the 4th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv4x = self._multiple_blocks(
            layer_name="conv4x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[2],
            preceding_output_channels=building_block_preceding_output_channels[2],
            input_channels=building_block_input_channels[2],
            overall_stride=2,  # the overall stride of the 4th convolutional layer should be 2, making 28x28 -> 14x14. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 4th convolutional layer of the ResNet, which contains multiple blocks."""

        # construct the 5th convolutional layer with multiple blocks, and its attached layers (batchnorm, activation, etc.)
        self.conv5x = self._multiple_blocks(
            layer_name="conv5x",
            building_block_type=building_block_type,
            building_block_num=building_block_nums[3],
            preceding_output_channels=building_block_preceding_output_channels[3],
            input_channels=building_block_input_channels[3],
            overall_stride=2,  # the overall stride of the 5th convolutional layer should be 2, making 14x14 -> 7x7. See Table 1 in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) for details.
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
        r"""The 5th convolutional layer of the ResNet, which contains multiple blocks."""

        self.avepool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        r"""The average pooling layer after the convolutional layers and before the feature maps are flattened."""

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: ResNetBlockSmall | ResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks. It's used to construct the 2nd-5th convolutional layers of the ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), the input is added to the output directly.
        2. If the input and output of the building block have different dimensions (number of channels or size), the input is added to the output after a convolutional layer that makes the dimensions match.

        **Args:**
        - **layer_name** (`str`): the name of this multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`ResNetBlockSmall` | `ResNetBlockLarge`): the type of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of input channels of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block, while the other building blocks keep an overall stride of 1. Inside that building block, the stride is performed at a certain convolutional layer, while the other convolutional layers keep stride 1:
            - For `ResNetBlockSmall`, it is performed at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it is performed at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """
        layer = []

        for block_idx in range(building_block_num):
            layer.append(
                building_block_type(
                    outer_layer_name=layer_name,
                    block_idx=block_idx,
                    preceding_output_channels=(
                        preceding_output_channels
                        if block_idx == 0
                        else (
                            input_channels
                            if building_block_type == ResNetBlockSmall
                            else input_channels * 4
                        )
                    ),  # the 1st block receives the preceding output of this entire multi-building-block layer; every later block receives the output of the previous block, which is the input channels for `ResNetBlockSmall` and 4 times the input channels for `ResNetBlockLarge`
                    input_channels=input_channels,
                    overall_stride=(
                        overall_stride if block_idx == 0 else 1
                    ),  # only the 1st block in this multi-building-block layer performs the overall stride
                    activation_layer=activation_layer,
                    batch_normalisation=batch_normalisation,
                    bias=bias,
                )
            )

            self.weighted_layer_names += layer[-1].weighted_layer_names  # collect the weighted layer names in the block and sync them to the weighted layer names list of the outer network

        return nn.Sequential(*layer)

    def forward(
        self, input: Tensor, stage: str | None = None, task_id: int | None = None
    ) -> tuple[Tensor, dict[str, Tensor]]:
        r"""The forward pass for data. It is the same for all tasks.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str` | `None`): unused in this base backbone; kept for interface consistency with the masked backbones.
        - **task_id** (`int` | `None`): unused in this base backbone; kept for interface consistency with the masked backbones.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed into the heads. This is the main target of backpropagation.
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used for the continual learning algorithms that need the hidden features for various purposes.
        """
        batch_size = input.size(0)
        hidden_features = {}

        x = input

        x = self.conv1(x)
        if self.batch_normalisation:
            x = self.conv_bn1(x)
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv3x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv4x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv5x:
            x, hidden_features_block = block(x)
            hidden_features.update(hidden_features_block)  # store the hidden features

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through the heads

        return output_feature, hidden_features
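The channel bookkeeping in `_multiple_blocks` is the subtle part: only the 1st block of a stage sees the preceding stage's output channels, while every later block sees the output of its predecessor, which is `input_channels` for small blocks and `input_channels * 4` for large blocks. A standalone sketch of that rule (a hypothetical helper mirroring the logic above, not the library API):

def preceding_channels(
    block_idx: int, stage_preceding: int, input_channels: int, large_block: bool
) -> int:
    """Channels fed into block `block_idx` of a stage (sketch of the rule above)."""
    if block_idx == 0:
        return stage_preceding  # the 1st block receives the preceding stage's output
    return input_channels * 4 if large_block else input_channels

# conv3x of ResNet-50: 4 large blocks, preceded by 256 channels, working width 128
print([preceding_channels(i, 256, 128, True) for i in range(4)])  # [256, 512, 512, 512]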
class ResNet18(ResNetBase):
    r"""ResNet-18 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weighted layers in total (including the final fully-connected layer). See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-18 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels of the network. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-18
            building_block_nums=(2, 2, 2, 2),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
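As a quick usage sketch (assuming `clarena` is installed and `ResNet18` is exported from `clarena.backbones`, as this module's own imports suggest), the backbone maps a batch of images to flattened 512-dimensional features plus the per-layer hidden features:

import torch

from clarena.backbones import ResNet18

backbone = ResNet18(input_channels=3, output_dim=512)

x = torch.randn(8, 3, 224, 224)  # a dummy batch of RGB images
output_feature, hidden_features = backbone(x)

print(output_feature.shape)  # torch.Size([8, 512]): flattened features passed to the CL heads
print(len(backbone.weighted_layer_names))  # 17 convolutional layers; the 18th weighted layer is the fully-connected head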
class ResNet34(ResNetBase):
    r"""ResNet-34 backbone network.

    This is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weighted layers in total (including the final fully-connected layer). See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-34 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels of the network. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockSmall,  # use the smaller building block for ResNet-34
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
class ResNet50(ResNetBase):
    r"""ResNet-50 backbone network.

    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weighted layers in total (including the final fully-connected layer). See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-50 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels of the network. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-50
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
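A quick sanity check on `building_block_preceding_output_channels=(64, 256, 512, 1024)` above: with large blocks, each stage outputs 4 times its working width, so from the 3rd convolutional layer onwards each stage is preceded by 4 times the previous stage's `input_channels` (a sketch of the arithmetic, not library code):

stem_output_channels = 64
stage_widths = (64, 128, 256, 512)  # building_block_input_channels

preceding = (stem_output_channels,) + tuple(w * 4 for w in stage_widths[:-1])
print(preceding)  # (64, 256, 512, 1024)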
class ResNet101(ResNetBase):
    r"""ResNet-101 backbone network.

    This is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weighted layers in total (including the final fully-connected layer). See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-101 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels of the network. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-101
            building_block_nums=(3, 4, 23, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
class ResNet152(ResNetBase):
    r"""ResNet-152 backbone network.

    This is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weighted layers in total (including the final fully-connected layer). See Table 1 in the paper for details.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = True,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-152 backbone network.

        **Args:**
        - **input_channels** (`int`): the number of input channels of the network. Note that convolutional networks require the number of input channels instead of an input dimension.
        - **output_dim** (`int`): the output dimension after the final flattening, which connects to the CL output heads. Although this is determined by the architecture before the flattening layer rather than chosen freely, it still needs to be provided to construct the heads.
        - **activation_layer** (`nn.Module` | `None`): the activation function after each layer. If `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weighted convolutional layers. Default `True`, same as what the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) does.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation plays a similar role to a bias.
        """
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=ResNetBlockLarge,  # use the larger building block for ResNet-152
            building_block_nums=(3, 8, 36, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=batch_normalisation,
            bias=bias,
        )
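For orientation, here is a minimal usage sketch. The 2048-dimensional flattened output (512 × 4 after the bottleneck expansion), the 224 × 224 input size, and the `(output_feature, hidden_features)` return pair are assumptions based on the standard ResNet layout and the block `forward()` signatures, not guarantees of this module:

import torch

from clarena.backbones import ResNet152

# Hypothetical instantiation: 3-channel images in, 2048-dim flattened feature out.
backbone = ResNet152(input_channels=3, output_dim=2048)

x = torch.randn(8, 3, 224, 224)  # a batch of 8 RGB images (assumed size)
output_feature, hidden_features = backbone(x)  # assumed to mirror the blocks' (feature, hidden_features) return
print(output_feature.shape)  # expected: torch.Size([8, 2048])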
class HATMaskResNetBlockSmall(HATMaskBackbone, ResNetBlockSmall):
    r"""The smaller building block for HAT masked ResNet-18/34.

    It consists of 2 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (left) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    The mask is applied to the units, which are the output channels of each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the smaller building block with task embedding.

        **Args:**
        - **outer_layer_name** (`str`): the name of the multi-building-block layer that contains this building block, used to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of this building block within the multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of the input of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at the 2nd (last) convolutional layer, while the 1st convolutional layer keeps a stride of 1.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
        ResNetBlockSmall.__init__(
            self,
            outer_layer_name=outer_layer_name,
            block_idx=block_idx,
            preceding_output_channels=preceding_output_channels,
            input_channels=input_channels,
            overall_stride=overall_stride,
            activation_layer=activation_layer,
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(gate=gate)

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = (
            input_channels  # the output channels of the 1st convolutional layer
        )
        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
        layer_output_channels = (
            input_channels * 1
        )  # the output channels of the 2nd convolutional layer, the same as the input channels (no expansion)
        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. The task-specific mask for `task_id` is applied to the units, which are the channels of each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to the testing stage. See chapter 2.4 "Hard Attention Training" in the [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to the training stage; for other stages it defaults to `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to the training stage; for other stages it defaults to `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for testing. Applies only to the testing stage; for other stages it defaults to `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is the layer name, value (`Tensor`) is the mask tensor of size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm itself does not need it, it is provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )  # remember the identity of the input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's

        x = input
        x = self.conv1(x)  # weighted convolutional layer first
        x = x * (
            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer second. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation1(x)  # activation function third
        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)  # weighted convolutional layer first
        x = x + identity  # shortcut connection
        x = x * (
            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
        )  # apply the mask to the 2nd convolutional layer after the shortcut connection. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation2(x)  # activation after the shortcut connection
        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, mask, hidden_features
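To make the channel-wise masking concrete, here is a small self-contained sketch of the broadcast used above; the shapes are illustrative assumptions, not values taken from this module:

import torch

# A per-channel mask of shape (C,) is reshaped to (1, C, 1, 1) so it broadcasts
# over a (N, C, H, W) feature map, attenuating whole channels at once.
features = torch.randn(8, 64, 56, 56)  # assumed conv output: batch of 8, 64 channels
embedding = torch.randn(64)            # one real value per channel (task embedding)
mask = torch.sigmoid(embedding)        # the sigmoid gate turns it into soft attention
masked = features * mask.view(1, -1, 1, 1)
assert masked.shape == features.shape  # masking never changes the feature shape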
class HATMaskResNetBlockLarge(HATMaskBackbone, ResNetBlockLarge):
    r"""The larger building block for HAT masked ResNet-50/101/152. It is referred to as the "bottleneck" building block in the ResNet paper.

    It consists of 3 weight convolutional layers, each followed by an activation function. See Table 1 or Figure 5 (right) in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    The mask is applied to the units, which are the output channels of each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        outer_layer_name: str,
        block_idx: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the larger building block with task embedding.

        **Args:**
        - **outer_layer_name** (`str`): the name of the multi-building-block layer that contains this building block, used to construct the full name of each weighted convolutional layer.
        - **block_idx** (`int`): the index of this building block within the multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this particular building block.
        - **input_channels** (`int`): the number of channels of the input of this building block.
        - **overall_stride** (`int`): the overall stride of this building block. This stride is performed at the 2nd (middle) convolutional layer, while the 1st and 3rd convolutional layers keep a stride of 1.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskBackbone.__init__(self, output_dim=None, gate=gate)
        ResNetBlockLarge.__init__(
            self,
            outer_layer_name=outer_layer_name,
            block_idx=block_idx,
            preceding_output_channels=preceding_output_channels,
            input_channels=input_channels,
            overall_stride=overall_stride,
            activation_layer=activation_layer,
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(gate=gate)

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = (
            input_channels  # the output channels of the 1st convolutional layer
        )
        self.task_embedding_t[self.full_1st_layer_name] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

        # construct the task embedding over the 2nd weighted convolutional layer. It is channel-wise
        layer_output_channels = (
            input_channels * 1
        )  # the output channels of the 2nd convolutional layer, the same as the input channels (no expansion)
        self.task_embedding_t[self.full_2nd_layer_name] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

        # construct the task embedding over the 3rd weighted convolutional layer. It is channel-wise
        layer_output_channels = (
            input_channels
            * 4  # the output channels of the 3rd convolutional layer, which is 4 times the input channels (expansion)
        )
        self.task_embedding_t[self.full_3rd_layer_name] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. The task-specific mask for `task_id` is applied to the units, which are the channels of each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to the testing stage. See chapter 2.4 "Hard Attention Training" in the [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to the training stage; for other stages it defaults to `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to the training stage; for other stages it defaults to `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for testing. Applies only to the testing stage; for other stages it defaults to `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature maps.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is the layer name, value (`Tensor`) is the mask tensor of size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm itself does not need it, it is provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        identity = (
            self.identity_downsample(input)
            if self.identity_downsample is not None
            else input
        )  # remember the identity of the input for the shortcut connection. Perform downsampling if its dimension doesn't match the output's

        x = input
        x = self.conv1(x)  # weighted convolutional layer first
        x = x * (
            mask[self.full_1st_layer_name].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer second. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation1(x)  # activation function third
        hidden_features[self.full_1st_layer_name] = x  # store the hidden feature

        x = self.conv2(x)  # weighted convolutional layer first
        x = x * (
            mask[self.full_2nd_layer_name].view(1, -1, 1, 1)
        )  # apply the mask to the 2nd convolutional layer second. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation2(x)  # activation function third
        hidden_features[self.full_2nd_layer_name] = x  # store the hidden feature

        x = self.conv3(x)  # weighted convolutional layer first
        x = x + identity  # shortcut connection
        x = x * (
            mask[self.full_3rd_layer_name].view(1, -1, 1, 1)
        )  # apply the mask to the 3rd convolutional layer after the shortcut connection. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation3(x)  # activation after the shortcut connection
        hidden_features[self.full_3rd_layer_name] = x  # store the hidden feature

        output_feature = x

        return output_feature, mask, hidden_features
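As a sanity check on the channel bookkeeping above, here is a short sketch (with an assumed block width of 64) of how the three task embedding sizes in the large block track its convolution widths; the layer names are placeholders, not the module's full layer names:

from torch import nn

input_channels = 64  # assumed width of one bottleneck block
embedding_dims = {
    "conv1": input_channels,      # 1st layer keeps the width
    "conv2": input_channels * 1,  # 2nd layer keeps the width (its stride may shrink the spatial size)
    "conv3": input_channels * 4,  # 3rd layer expands the width 4 times (bottleneck design)
}
task_embedding = {
    name: nn.Embedding(num_embeddings=1, embedding_dim=dim)  # one embedding value per channel
    for name, dim in embedding_dims.items()
}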
class HATMaskResNetBase(ResNetBase, HATMaskBackbone):
    r"""The base class of HAT masked [residual networks (ResNet)](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html).

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet is a convolutional network architecture. It has a 1st convolutional parameter layer and a max-pooling layer, followed by 4 convolutional layers, each containing multiple convolutional parameter layers. Each of these 4 layers is constructed from basic building blocks, which are either small (`ResNetBlockSmall`) or large (`ResNetBlockLarge`); each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection, a direct connection from the input of the block to its output, which is why the network is called residual (see "shortcut connections" in the paper for details). After the 5th convolutional layer, an average pooling layer and a fully-connected layer connect to the CL output heads.

    The mask is applied to the units, which are the output channels of each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_nums: tuple[int, int, int, int],
        building_block_preceding_output_channels: tuple[int, int, int, int],
        building_block_input_channels: tuple[int, int, int, int],
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Images keep their number of channels when entering the ResNet. Note that convolutional networks require the number of input channels instead of the dimension.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of building block used in the ResNet.
        - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2nd-5th convolutional layers, respectively.
        - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of the preceding output of each building block in the 2nd-5th convolutional layers, respectively.
        - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of the input of each building block in the 2nd-5th convolutional layers, respectively.
        - **output_dim** (`int`): the output dimension after flattening at last, which connects to the CL output heads. Although this is determined not by us but by the architecture built before the flattening layer, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`, because batch normalisation does a similar job to the bias.
        """
        # init from both inherited classes
        HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate)
        ResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=building_block_type,
            building_block_nums=building_block_nums,
            building_block_preceding_output_channels=building_block_preceding_output_channels,
            building_block_input_channels=building_block_input_channels,
            output_dim=output_dim,
            activation_layer=activation_layer,
            batch_normalisation=False,  # batch normalisation is incompatible with the HAT mechanism
            bias=bias,
        )
        self.register_hat_mask_module_explicitly(
            gate=gate
        )  # register all `nn.Module`s of HATMaskBackbone explicitly, because the second `__init__()` wipes out those initialised by the first `__init__()`
        self.update_multiple_blocks_task_embedding()

        # construct the task embedding over the 1st weighted convolutional layer. It is channel-wise
        layer_output_channels = 64  # the output channels of the 1st convolutional layer
        self.task_embedding_t["conv1"] = nn.Embedding(
            num_embeddings=1, embedding_dim=layer_output_channels
        )

    def _multiple_blocks(
        self,
        layer_name: str,
        building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge,
        building_block_num: int,
        preceding_output_channels: int,
        input_channels: int,
        overall_stride: int,
        activation_layer: nn.Module | None = nn.ReLU,
        batch_normalisation: bool = False,
        bias: bool = False,
    ) -> nn.Sequential:
        r"""Construct a layer consisting of multiple building blocks with task embedding. It is used to construct the 2nd-5th convolutional layers of the HAT masked ResNet.

        The "shortcut connections" are performed between the input and output of each building block:
        1. If the input and output of the building block have exactly the same dimensions (including number of channels and size), the input is added to the output directly.
        2. If they have different dimensions (in number of channels or size), the input is added to the output after a convolutional layer that makes the dimensions match.

        **Args:**
        - **layer_name** (`str`): the name of this multi-building-block layer, used to construct the full name of each weighted convolutional layer.
        - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of the building block.
        - **building_block_num** (`int`): the number of building blocks in this multi-building-block layer.
        - **preceding_output_channels** (`int`): the number of channels of the preceding output of this entire multi-building-block layer.
        - **input_channels** (`int`): the number of channels of the input of this multi-building-block layer.
        - **overall_stride** (`int`): the overall stride of the building blocks. This stride is performed at the 1st building block, while the other building blocks keep an overall stride of 1. Inside that building block, the stride is performed at a particular convolutional layer, while the other convolutional layers keep a stride of 1:
            - For `ResNetBlockSmall`, it is performed at the 2nd (last) layer.
            - For `ResNetBlockLarge`, it is performed at the 2nd (middle) layer.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, no activation layer is used. Default `nn.ReLU`.
        - **batch_normalisation** (`bool`): whether to use batch normalisation after the weight convolutional layers. In HATMaskResNet, batch normalisation is incompatible with the HAT mechanism and should always be set `False`. The argument is included for compatibility with the original ResNet API.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.

        **Returns:**
        - **layer** (`nn.Sequential`): the constructed layer consisting of multiple building blocks.
        """

        layer = []

        for block_idx in range(building_block_num):
            layer.append(
                building_block_type(
                    outer_layer_name=layer_name,
                    block_idx=block_idx,
                    preceding_output_channels=(
                        preceding_output_channels
                        if block_idx == 0
                        else (
                            input_channels
                            if building_block_type == HATMaskResNetBlockSmall
                            else input_channels * 4
                        )
                    ),  # for the 1st block in this multi-building-block layer, this is the preceding output of the entire layer. Otherwise, it is the number of output channels of the previous building block: `input_channels` for `HATMaskResNetBlockSmall` and `input_channels * 4` for `HATMaskResNetBlockLarge`
                    input_channels=input_channels,
                    overall_stride=(
                        overall_stride if block_idx == 0 else 1
                    ),  # only perform the overall stride at the 1st block in this multi-building-block layer
                    gate=self.gate,
                    # no batch normalisation in HAT masked blocks
                    activation_layer=activation_layer,
                    bias=bias,
                )
            )

            self.weighted_layer_names += layer[
                -1
            ].weighted_layer_names  # collect the weighted layer names in the blocks and sync them to the weighted layer names list in the outer network

        return nn.Sequential(*layer)

    def update_multiple_blocks_task_embedding(self) -> None:
        r"""Collect the task embeddings in the multiple building blocks (the 2nd-5th convolutional layers) and sync them to the task embedding dict in the outer network.

        This should be called explicitly after the `__init__()` method, because the task embeddings, as `nn.Module` instances, were wiped out at the beginning of it.
        """
        for block in self.conv2x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv3x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv4x:
            self.task_embedding_t.update(block.task_embedding_t)
        for block in self.conv5x:
            self.task_embedding_t.update(block.task_embedding_t)

    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. The task-specific mask for `task_id` is applied to the units, which are the channels of each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to the testing stage. See chapter 2.4 "Hard Attention Training" in the [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to the training stage; for other stages it defaults to `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to the training stage; for other stages it defaults to `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for testing. Applies only to the testing stage; for other stages it defaults to `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is the layer name, value (`Tensor`) is the mask tensor of size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm itself does not need it, it is provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        batch_size = input.size(0)
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        x = input

        x = self.conv1(x)

        x = x * (
            mask["conv1"].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer. Broadcast the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv3x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv4x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv5x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through the heads

        return output_feature, mask, hidden_features
The base class of HAT masked residual network (ResNet).
HAT (Hard Attention to the Task, 2018) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.
ResNet is a convolutional network architecture, which has 1st convolutional parameter layer and a maxpooling layer, connecting to 4 convolutional layers which contains multiple convolutional parameter layer. Each layer of the 4 are constructed from basic building blocks which are either small (ResNetBlockSmall
) or large (ResNetBlockLarge
). Each building block contains several convolutional parameter layers. The building blocks are connected by a skip connection which is a direct connection from the input of the block to the output of the block, and this is why it's called residual (find "shortcut connections" in the paper for more details). After the 5th convolutional layer, there are average pooling layer and a fully connected layer which connects to the CL output heads.
Mask is applied to the units which are output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and gate function.
1065 def __init__( 1066 self, 1067 input_channels: int, 1068 building_block_type: HATMaskResNetBlockSmall | HATMaskResNetBlockLarge, 1069 building_block_nums: tuple[int, int, int, int], 1070 building_block_preceding_output_channels: tuple[int, int, int, int], 1071 building_block_input_channels: tuple[int, int, int, int], 1072 output_dim: int, 1073 gate: str, 1074 activation_layer: nn.Module | None = nn.ReLU, 1075 bias: bool = False, 1076 ) -> None: 1077 r"""Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism. 1078 1079 **Args:** 1080 - **input_channels** (`int`): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension. 1081 - **building_block_type** (`HATMaskResNetBlockSmall` | `HATMaskResNetBlockLarge`): the type of building block used in the ResNet. 1082 - **building_block_nums** (`tuple[int, int, int, int]`): the number of building blocks in the 2-5 convolutional layer correspondingly. 1083 - **building_block_preceding_output_channels** (`tuple[int, int, int, int]`): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly. 1084 - **building_block_input_channels** (`tuple[int, int, int, int]`): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly. 1085 - **output_dim** (`int`): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. 1086 - **gate** (`str`): the type of gate function turning the real value task embeddings into attention masks, should be one of the following: 1087 - `sigmoid`: the sigmoid function. 1088 - **activation_layer** (`nn.Module`): activation function of each layer (if not `None`), if `None` this layer won't be used. Default `nn.ReLU`. 1089 - **bias** (`bool`): whether to use bias in the convolutional layer. Default `False`, because batch normalisation are doing the similar thing with bias. 1090 """ 1091 # init from both inherited classes 1092 HATMaskBackbone.__init__(self, output_dim=output_dim, gate=gate) 1093 ResNetBase.__init__( 1094 self, 1095 input_channels=input_channels, 1096 building_block_type=building_block_type, 1097 building_block_nums=building_block_nums, 1098 building_block_preceding_output_channels=building_block_preceding_output_channels, 1099 building_block_input_channels=building_block_input_channels, 1100 output_dim=output_dim, 1101 activation_layer=activation_layer, 1102 batch_normalisation=False, # batch normalisation is incompatible with HAT mechanism 1103 bias=bias, 1104 ) 1105 self.register_hat_mask_module_explicitly( 1106 gate=gate 1107 ) # register all `nn.Module`s for HATMaskBackbone explicitly because the second `__init__()` wipes out them inited by the first `__init__()` 1108 self.update_multiple_blocks_task_embedding() 1109 1110 # construct the task embedding over the 1st weighted convolutional layers. It is channel-wise 1111 layer_output_channels = 64 # the output channels of the 1st convolutional layer 1112 self.task_embedding_t["conv1"] = nn.Embedding( 1113 num_embeddings=1, embedding_dim=layer_output_channels 1114 )
Construct and initialise the HAT masked ResNet backbone network with task embedding. Note that batch normalisation is incompatible with HAT mechanism.
Args:
- input_channels (
int
): the number of channels of input. Image data are kept channels when going in ResNet. Note that convolutional networks require number of input channels instead of dimension. - building_block_type (
HATMaskResNetBlockSmall
|HATMaskResNetBlockLarge
): the type of building block used in the ResNet. - building_block_nums (
tuple[int, int, int, int]
): the number of building blocks in the 2-5 convolutional layer correspondingly. - building_block_preceding_output_channels (
tuple[int, int, int, int]
): the number of channels of preceding output of each building block in the 2-5 convolutional layer correspondingly. - building_block_input_channels (
tuple[int, int, int, int]
): the number of channels of input of each building block in the 2-5 convolutional layer correspondingly. - output_dim (
int
): the output dimension after flattening at last which connects to CL output heads. Although this is not determined by us but the architecture built before the flattening layer, we still need to provide this to construct the heads. - gate (
str
): the type of gate function turning the real value task embeddings into attention masks, should be one of the following:sigmoid
: the sigmoid function.
- activation_layer (
nn.Module
): activation function of each layer (if notNone
), ifNone
this layer won't be used. Defaultnn.ReLU
. - bias (
bool
): whether to use bias in the convolutional layer. DefaultFalse
, because batch normalisation are doing the similar thing with bias.
1184 def update_multiple_blocks_task_embedding(self) -> None: 1185 r"""Collect the task embeddings in the multiple building blocks (2-5 convolutional layers) and sync to the weighted layer names list in the outer network. 1186 1187 This should only be called explicitly after the `__init__()` method, just because task embedding as `nn.Module` instance was wiped out at the beginning of it. 1188 """ 1189 for block in self.conv2x: 1190 self.task_embedding_t.update(block.task_embedding_t) 1191 for block in self.conv3x: 1192 self.task_embedding_t.update(block.task_embedding_t) 1193 for block in self.conv4x: 1194 self.task_embedding_t.update(block.task_embedding_t) 1195 for block in self.conv5x: 1196 self.task_embedding_t.update(block.task_embedding_t)
Collect the task embeddings in the multiple building blocks (2-5 convolutional layers) and sync to the weighted layer names list in the outer network.
This should only be called explicitly after the __init__()
method, just because task embedding as nn.Module
instance was wiped out at the beginning of it.
    def forward(
        self,
        input: Tensor,
        stage: str,
        s_max: float | None = None,
        batch_idx: int | None = None,
        num_batches: int | None = None,
        test_mask: dict[str, Tensor] | None = None,
    ) -> tuple[Tensor, dict[str, Tensor], dict[str, Tensor]]:
        r"""The forward pass for data from task `task_id`. The task-specific mask for `task_id` is applied to the units, which are the channels in each weighted convolutional layer.

        **Args:**
        - **input** (`Tensor`): the input tensor from data.
        - **stage** (`str`): the stage of the forward pass, should be one of the following:
            1. 'train': training stage.
            2. 'validation': validation stage.
            3. 'test': testing stage.
        - **s_max** (`float` | `None`): the maximum scaling factor in the gate function. Doesn't apply to the testing stage. See section 2.4 "Hard Attention Training" in the [HAT paper](http://proceedings.mlr.press/v80/serra18a).
        - **batch_idx** (`int` | `None`): the current batch index. Applies only to the training stage; for other stages, it defaults to `None`.
        - **num_batches** (`int` | `None`): the total number of batches. Applies only to the training stage; for other stages, it defaults to `None`.
        - **test_mask** (`dict[str, Tensor]` | `None`): the binary mask used for testing. Applies only to the testing stage; for other stages, it defaults to `None`.

        **Returns:**
        - **output_feature** (`Tensor`): the output feature tensor to be passed to the heads.
        - **mask** (`dict[str, Tensor]`): the mask for the current task. Key (`str`) is the layer name, value (`Tensor`) is the mask tensor. The mask tensor has size (number of units).
        - **hidden_features** (`dict[str, Tensor]`): the hidden features (after activation) in each weighted layer. Key (`str`) is the weighted layer name, value (`Tensor`) is the hidden feature tensor. This is used by continual learning algorithms that need the hidden features for various purposes. Although the HAT algorithm does not need this, it is still provided for API consistency with other HAT-based algorithms that inherit this `forward()` method of the `HAT` class.
        """
        batch_size = input.size(0)
        hidden_features = {}

        # get the mask for the current task from the task embedding in this stage
        mask = self.get_mask(
            stage=stage,
            s_max=s_max,
            batch_idx=batch_idx,
            num_batches=num_batches,
            test_mask=test_mask,
        )

        x = input

        x = self.conv1(x)

        x = x * (
            mask["conv1"].view(1, -1, 1, 1)
        )  # apply the mask to the 1st convolutional layer, broadcasting the mask's dimensions to match the input
        if self.activation:
            x = self.conv_activation1(x)
        hidden_features["conv1"] = x

        x = self.maxpool(x)

        for block in self.conv2x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv3x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv4x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features
        for block in self.conv5x:
            x, _, hidden_features_block = block(
                x,
                stage=stage,
                s_max=s_max,
                batch_idx=batch_idx,
                num_batches=num_batches,
                test_mask=test_mask,
            )
            hidden_features.update(hidden_features_block)  # store the hidden features

        x = self.avepool(x)

        output_feature = x.view(batch_size, -1)  # flatten before going through heads

        return output_feature, mask, hidden_features
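As a usage illustration only (not from the source), here is a sketch of how this forward pass might be invoked during training on one batch. It assumes the current task's embedding has already been set up by the HAT algorithm driving the backbone; the batch shape and `s_max` value are illustrative (the latter is the one used in the HAT paper's experiments):

import torch

backbone = HATMaskResNet18(input_channels=3, output_dim=512, gate="sigmoid")

batch = torch.randn(8, 3, 224, 224)  # a hypothetical batch of 8 RGB images
output_feature, mask, hidden_features = backbone(
    batch,
    stage="train",
    s_max=400.0,    # maximum scaling factor, as in the HAT paper's experiments
    batch_idx=0,
    num_batches=100,
)
print(output_feature.shape)  # expected: torch.Size([8, 512])
print(mask["conv1"].shape)   # per-channel mask for the stem convolution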
class HATMaskResNet18(HATMaskResNetBase):
    r"""HAT masked ResNet-18 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-18 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 18 weighted layers in total. See Table 1 in the paper for details.

    The mask is applied to the units, which are the output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-18 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-18
            building_block_nums=(2, 2, 2, 2),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
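A quick cross-check (not part of the library) of the layer counts claimed in the docstrings of this and the following variants: each small block contributes 2 convolutions, each large (bottleneck) block 3, plus the stem convolution and the final classification head:

def weighted_layer_count(block_nums: tuple[int, ...], convs_per_block: int) -> int:
    # stem conv1 + convolutions inside the blocks + final classification head
    return 1 + convs_per_block * sum(block_nums) + 1

assert weighted_layer_count((2, 2, 2, 2), 2) == 18    # ResNet-18, small blocks
assert weighted_layer_count((3, 4, 6, 3), 2) == 34    # ResNet-34, small blocks
assert weighted_layer_count((3, 4, 6, 3), 3) == 50    # ResNet-50, large blocks
assert weighted_layer_count((3, 4, 23, 3), 3) == 101  # ResNet-101, large blocks
assert weighted_layer_count((3, 8, 36, 3), 3) == 152  # ResNet-152, large blocks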
class HATMaskResNet34(HATMaskResNetBase):
    r"""HAT masked ResNet-34 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-34 is a smaller architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 34 weighted layers in total. See Table 1 in the paper for details.

    The mask is applied to the units, which are the output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-34 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockSmall,  # use the smaller building block for ResNet-34
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 64, 128, 256),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
class HATMaskResNet50(HATMaskResNetBase):
    r"""HAT masked ResNet-50 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-50 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 50 weighted layers in total. See Table 1 in the paper for details.

    The mask is applied to the units, which are the output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-50 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-50
            building_block_nums=(3, 4, 6, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
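Note the `building_block_preceding_output_channels` values here: as in the original ResNet design, the bottleneck (large) block expands its output channels 4x, so each stage receives 4x the previous stage's input channels, except the first, which follows the 64-channel stem. A small consistency check, not part of the library:

expansion = 4  # output-channel expansion of the bottleneck (large) block
stage_input_channels = (64, 128, 256, 512)
preceding_output_channels = (64,) + tuple(
    expansion * channels for channels in stage_input_channels[:-1]
)
assert preceding_output_channels == (64, 256, 512, 1024)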
class HATMaskResNet101(HATMaskResNetBase):
    r"""HAT masked ResNet-101 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-101 is a larger architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 101 weighted layers in total. See Table 1 in the paper for details.

    The mask is applied to the units, which are the output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-101 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-101
            building_block_nums=(3, 4, 23, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )
class HATMaskResNet152(HATMaskResNetBase):
    r"""HAT masked ResNet-152 backbone network.

    [HAT (Hard Attention to the Task, 2018)](http://proceedings.mlr.press/v80/serra18a) is an architecture-based continual learning approach that uses learnable hard attention masks to select the task-specific parameters.

    ResNet-152 is the largest architecture proposed in the [original ResNet paper](https://www.cv-foundation.org/openaccess/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html). It consists of 152 weighted layers in total. See Table 1 in the paper for details.

    The mask is applied to the units, which are the output channels in each weighted convolutional layer. The mask is generated from the unit-wise task embedding and the gate function.
    """

    def __init__(
        self,
        input_channels: int,
        output_dim: int,
        gate: str,
        activation_layer: nn.Module | None = nn.ReLU,
        bias: bool = False,
    ) -> None:
        r"""Construct and initialise the ResNet-152 backbone network with task embedding. Note that batch normalisation is incompatible with the HAT mechanism.

        **Args:**
        - **input_channels** (`int`): the number of channels of the input. Note that convolutional networks require the number of input channels instead of the input dimension.
        - **output_dim** (`int`): the output dimension after flattening at the end, which connects to the CL output heads. Although this is determined by the architecture built before the flattening layer rather than by us, we still need to provide it to construct the heads.
        - **gate** (`str`): the type of gate function turning the real-valued task embeddings into attention masks, should be one of the following:
            - `sigmoid`: the sigmoid function.
        - **activation_layer** (`nn.Module` | `None`): activation function of each layer (if not `None`); if `None`, this layer won't be used. Default `nn.ReLU`.
        - **bias** (`bool`): whether to use bias in the convolutional layers. Default `False`.
        """
        HATMaskResNetBase.__init__(
            self,
            input_channels=input_channels,
            building_block_type=HATMaskResNetBlockLarge,  # use the larger building block for ResNet-152
            building_block_nums=(3, 8, 36, 3),
            building_block_preceding_output_channels=(64, 256, 512, 1024),
            building_block_input_channels=(64, 128, 256, 512),
            output_dim=output_dim,
            gate=gate,
            activation_layer=activation_layer,
            bias=bias,
        )