# Copyright (C) 2020-2023, François-Guillaume Fernandez.# This program is licensed under the Apache License 2.0.# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0> for full license details.importloggingimportmathfromtypingimportAny,List,Optional,Tuple,Unionimporttorchimporttorch.nn.functionalasFfromtorchimportTensor,nnfrom._utilsimportlocate_linear_layerfrom.coreimport_CAM__all__=["CAM","ScoreCAM","SSCAM","ISCAM"]
class CAM(_CAM):
    r"""Class activation map extractor following `"Learning Deep Features for Discriminative Localization"
    <https://arxiv.org/pdf/1512.04150.pdf>`_.

    The Class Activation Map (CAM) is defined for image classification models that have global pooling at the end
    of the visual feature extraction block. The localization map is computed as follows:

    .. math::
        L^{(c)}_{CAM}(x, y) = ReLU\Big(\sum\limits_k w_k^{(c)} A_k(x, y)\Big)

    where :math:`A_k(x, y)` is the activation of node :math:`k` in the target layer of the model at
    position :math:`(x, y)`,
    and :math:`w_k^{(c)}` is the weight corresponding to class :math:`c` for unit :math:`k` in the fully
    connected layer.

    >>> from torchvision.models import resnet18
    >>> from torchcam.methods import CAM
    >>> model = resnet18(pretrained=True).eval()
    >>> cam = CAM(model, 'layer4', 'fc')
    >>> with torch.no_grad(): out = model(input_tensor)
    >>> cam(class_idx=100)

    Args:
        model: input model
        target_layer: either the target layer itself or its name, or a list of those
        fc_layer: either the fully connected layer itself or its name
        input_shape: shape of the expected input tensor excluding the batch dimension

    Raises:
        ValueError: if several target layers are given, or if `fc_layer` is omitted and cannot be located
            automatically
        TypeError: if `fc_layer` is neither a string, a module nor `None`
    """

    def __init__(
        self,
        model: nn.Module,
        target_layer: Optional[Union[Union[nn.Module, str], List[Union[nn.Module, str]]]] = None,
        fc_layer: Optional[Union[nn.Module, str]] = None,
        input_shape: Tuple[int, ...] = (3, 224, 224),
        **kwargs: Any,
    ) -> None:
        # The classifier weights are tied to a single feature map, hence a single target layer
        if isinstance(target_layer, list) and len(target_layer) > 1:
            raise ValueError("base CAM does not support multiple target layers")

        super().__init__(model, target_layer, input_shape, **kwargs)

        # Resolve the name under which the classification layer is registered
        if fc_layer is None:
            # Nothing specified: fall back on automatic resolution
            fc_name = locate_linear_layer(model)  # type: ignore[assignment]
            if not isinstance(fc_name, str):
                raise ValueError("unable to resolve `fc_layer` automatically, please specify its value.")
            # Let the user know which layer was picked
            logging.warning(f"no value was provided for `fc_layer`, thus set to '{fc_name}'.")
        elif isinstance(fc_layer, str):
            fc_name = fc_layer
        elif isinstance(fc_layer, nn.Module):
            # Look up the name of the module that was passed
            fc_name = self._resolve_layer_name(fc_layer)
        else:
            raise TypeError("invalid argument type for `fc_layer`")

        # Classifier weights, used as the per-channel coefficients
        fc_weights = self.submodule_dict[fc_name].weight.data
        # Keep only the first two dimensions to accommodate a replacement by Conv1x1
        if fc_weights.ndim > 2:
            fc_weights = fc_weights.view(*fc_weights.shape[:2])
        self._fc_weights = fc_weights

    @torch.no_grad()
    def _get_weights(  # type: ignore[override]
        self,
        class_idx: Union[int, List[int]],
        *args: Any,
    ) -> List[Tensor]:
        """Computes the weight coefficients of the hooked activation maps.

        Args:
            class_idx: a single class index, or a list of class indices
            *args: ignored

        Returns:
            a single-element list holding the classifier weight rows selected by `class_idx`
        """
        # Rows of the classifier weights for the requested class(es)
        selected = self._fc_weights[class_idx, :]
        # A single index yields a 1D row: add a leading dimension so the result is always 2D
        return [selected.unsqueeze(0) if isinstance(class_idx, int) else selected]
class ScoreCAM(_CAM):
    r"""Implements a class activation map extractor as described in `"Score-CAM:
    Score-Weighted Visual Explanations for Convolutional Neural Networks" <https://arxiv.org/pdf/1910.01279.pdf>`_.

    The localization map is computed as follows:

    .. math::
        L^{(c)}_{Score-CAM}(x, y) = ReLU\Big(\sum\limits_k w_k^{(c)} A_k(x, y)\Big)

    with the coefficient :math:`w_k^{(c)}` being defined as:

    .. math::
        w_k^{(c)} = softmax\Big(Y^{(c)}(M_k) - Y^{(c)}(X_b)\Big)_k

    where :math:`A_k(x, y)` is the activation of node :math:`k` in the target layer of the model at
    position :math:`(x, y)`, :math:`Y^{(c)}(X)` is the model output score for class :math:`c` before softmax
    for input :math:`X`, :math:`X_b` is a baseline image,
    and :math:`M_k` is defined as follows:

    .. math::
        M_k = \frac{U(A_k) - \min\limits_m U(A_m)}{\max\limits_m U(A_m) - \min\limits_m U(A_m)}
        \odot X_b

    where :math:`\odot` refers to the element-wise multiplication and :math:`U` is the upsampling operation.

    >>> from torchvision.models import resnet18
    >>> from torchcam.methods import ScoreCAM
    >>> model = resnet18(pretrained=True).eval()
    >>> cam = ScoreCAM(model, 'layer4')
    >>> with torch.no_grad(): out = model(input_tensor)
    >>> cam(class_idx=100)

    Args:
        model: input model
        target_layer: either the target layer itself or its name, or a list of those
        batch_size: batch size used to forward masked inputs
        input_shape: shape of the expected input tensor excluding the batch dimension
    """

    def __init__(
        self,
        model: nn.Module,
        target_layer: Optional[Union[Union[nn.Module, str], List[Union[nn.Module, str]]]] = None,
        batch_size: int = 32,
        input_shape: Tuple[int, ...] = (3, 224, 224),
        **kwargs: Any,
    ) -> None:
        super().__init__(model, target_layer, input_shape, **kwargs)

        # Input hook: the model input is needed to build the masked inputs
        self.hook_handles.append(model.register_forward_pre_hook(self._store_input))  # type: ignore[arg-type]
        self.bs = batch_size
        # Ensure ReLU is applied to CAM before normalization
        self._relu = True

    def _store_input(self, module: nn.Module, input: Tensor) -> None:
        """Store model input tensor.

        Used as a forward pre-hook: keeps a copy of the first positional input of the model, unless hook
        updates are currently disabled.
        """
        if self._hooks_enabled:
            self._input = input[0].data.clone()

    @torch.no_grad()
    def _get_score_weights(self, activations: List[Tensor], class_idx: Union[int, List[int]]) -> List[Tensor]:
        """Scores each masked input with the model and turns the scores into channel weights.

        Args:
            activations: normalized activation maps, already upsampled to the spatial size of the input
            class_idx: a single class index, or one class index per batch sample

        Returns:
            one tensor per activation, holding the softmax over channels of the score difference between
            the masked inputs and the original input
        """
        b = activations[0].shape[0]
        # (N, M)
        logits = self.model(self._input)
        # Per-sample targets are loop-invariant: build the index tensor once
        targets = None if isinstance(class_idx, int) else torch.tensor(class_idx, device=logits.device)

        weights: List[Tensor] = []
        for act in activations:
            # The channel count is read per layer, since hooked layers may differ in width
            c = act.shape[1]
            # (N * C, I, H, W)
            scored_input = (act.unsqueeze(2) * self._input.unsqueeze(1)).view(b * c, *self._input.shape[1:])
            # Batch sample index of each row of `scored_input`
            idcs = torch.arange(b, device=logits.device).repeat_interleave(c)
            # (N * C)
            layer_weights = torch.zeros(b * c, dtype=act.dtype, device=act.device)

            # Process by chunk (GPU RAM limitation)
            for start in range(0, b * c, self.bs):
                _slice = slice(start, start + self.bs)
                # Score difference between the masked inputs and the original input
                # (*, M)
                cic = self.model(scored_input[_slice]) - logits[idcs[_slice]]
                if targets is None:
                    layer_weights[_slice] = cic[:, class_idx]
                else:
                    layer_weights[_slice] = cic.gather(1, targets[idcs[_slice]].view(-1, 1)).squeeze(1)

            # Reshape the weights (N, C)
            weights.append(torch.softmax(layer_weights.view(b, c), -1))

        return weights

    @torch.no_grad()
    def _get_weights(  # type: ignore[override]
        self,
        class_idx: Union[int, List[int]],
        *args: Any,
    ) -> List[Tensor]:
        """Computes the weight coefficients of the hooked activation maps.

        Args:
            class_idx: a single class index, or a list of class indices
            *args: ignored

        Returns:
            one weight tensor per hooked activation

        Raises:
            ValueError: if the stored input does not have 1, 2 or 3 spatial dimensions
        """
        self.hook_a: List[Tensor]  # type: ignore[assignment]

        # Normalize the activation
        # (N, C, H', W')
        upsampled_a = [self._normalize(act.clone(), act.ndim - 2) for act in self.hook_a]

        # Upsample it to input_size
        # (N, C, H, W)
        spatial_dims = self._input.ndim - 2
        # `align_corners` is only accepted by the (bi/tri)linear modes, so pick the one matching the rank
        interpolation_modes = {1: "linear", 2: "bilinear", 3: "trilinear"}
        if spatial_dims not in interpolation_modes:
            raise ValueError(f"unsupported number of spatial dimensions for the input: {spatial_dims}")
        interpolation_mode = interpolation_modes[spatial_dims]
        upsampled_a = [
            F.interpolate(up_a, self._input.shape[2:], mode=interpolation_mode, align_corners=False)
            for up_a in upsampled_a
        ]

        # Disable hook updates: the scoring forward passes must not overwrite the stored input/activations
        self._hooks_enabled = False
        # Remember the mode of every submodule: `eval()` switches all of them, not only the root
        training_states = [(module, module.training) for module in self.model.modules()]
        # Switch to eval
        self.model.eval()
        try:
            weights: List[Tensor] = self._get_score_weights(upsampled_a, class_idx)
        finally:
            # Put back every module in its original mode, even if the scoring failed
            for module, was_training in training_states:
                module.training = was_training
            # Reenable hook updates
            self._hooks_enabled = True

        return weights

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(batch_size={self.bs})"
class SSCAM(ScoreCAM):
    r"""Implements a class activation map extractor as described in `"SS-CAM: Smoothed Score-CAM for
    Sharper Visual Feature Localization" <https://arxiv.org/pdf/2006.14255.pdf>`_.

    The localization map is computed as follows:

    .. math::
        L^{(c)}_{SS-CAM}(x, y) = ReLU\Big(\sum\limits_k w_k^{(c)} A_k(x, y)\Big)

    with the coefficient :math:`w_k^{(c)}` being defined as:

    .. math::
        w_k^{(c)} = softmax\Big(\frac{1}{N} \sum\limits_{i=1}^N (Y^{(c)}(\hat{M_k}) - Y^{(c)}(X_b))\Big)_k

    where :math:`N` is the number of samples used to smooth the weights,
    :math:`A_k(x, y)` is the activation of node :math:`k` in the target layer of the model at
    position :math:`(x, y)`, :math:`Y^{(c)}(X)` is the model output score for class :math:`c` before softmax
    for input :math:`X`, :math:`X_b` is a baseline image,
    and :math:`\hat{M_k}` is defined as follows:

    .. math::
        \hat{M_k} = \Bigg(\frac{U(A_k) - \min\limits_m U(A_m)}{\max\limits_m U(A_m) - \min\limits_m U(A_m)} +
        \delta\Bigg) \odot X_b

    where :math:`\odot` refers to the element-wise multiplication, :math:`U` is the upsampling operation,
    :math:`\delta \sim \mathcal{N}(0, \sigma^2)` is the random noise that follows a 0-mean gaussian distribution
    with a standard deviation of :math:`\sigma`.

    >>> from torchvision.models import resnet18
    >>> from torchcam.methods import SSCAM
    >>> model = resnet18(pretrained=True).eval()
    >>> cam = SSCAM(model, 'layer4')
    >>> with torch.no_grad(): out = model(input_tensor)
    >>> cam(class_idx=100)

    Args:
        model: input model
        target_layer: either the target layer itself or its name, or a list of those
        batch_size: batch size used to forward masked inputs
        num_samples: number of noisy samples used for weight computation
        std: standard deviation of the noise added to the normalized activation
        input_shape: shape of the expected input tensor excluding the batch dimension
    """

    def __init__(
        self,
        model: nn.Module,
        target_layer: Optional[Union[Union[nn.Module, str], List[Union[nn.Module, str]]]] = None,
        batch_size: int = 32,
        num_samples: int = 35,
        std: float = 2.0,
        input_shape: Tuple[int, ...] = (3, 224, 224),
        **kwargs: Any,
    ) -> None:
        super().__init__(model, target_layer, batch_size, input_shape, **kwargs)

        self.num_samples = num_samples
        self.std = std
        # Zero-mean gaussian the noise is drawn from
        self._distrib = torch.distributions.normal.Normal(0, self.std)

    @torch.no_grad()
    def _get_score_weights(self, activations: List[Tensor], class_idx: Union[int, List[int]]) -> List[Tensor]:
        """Scores noisy masked inputs with the model and turns the averaged scores into channel weights.

        Args:
            activations: normalized activation maps, already upsampled to the spatial size of the input
            class_idx: a single class index, or one class index per batch sample

        Returns:
            one tensor per activation, holding the softmax over channels of the score difference between
            the noisy masked inputs and the original input, averaged over `num_samples` noise draws
        """
        b = activations[0].shape[0]
        # (N, M)
        logits = self.model(self._input)
        # Per-sample targets are loop-invariant: build the index tensor once
        targets = None if isinstance(class_idx, int) else torch.tensor(class_idx, device=logits.device)

        weights: List[Tensor] = []
        for act in activations:
            # The channel count is read per layer, since hooked layers may differ in width
            c = act.shape[1]
            # Batch sample index of each row of the flattened masked inputs
            idcs = torch.arange(b, device=logits.device).repeat_interleave(c)
            # (N * C)
            layer_weights = torch.zeros(b * c, dtype=act.dtype, device=act.device)

            for _ in range(self.num_samples):
                # Add noise, cast to the activation dtype so that the masked input keeps that dtype
                noise = self._distrib.sample(act.size()).to(device=act.device, dtype=act.dtype)
                # (N, C, I, H, W)
                scored_input = (act + noise).unsqueeze(2) * self._input.unsqueeze(1)
                # (N * C, I, H, W)
                scored_input = scored_input.view(b * c, *scored_input.shape[2:])

                # Process by chunk (GPU RAM limitation)
                for start in range(0, b * c, self.bs):
                    _slice = slice(start, start + self.bs)
                    # Score difference between the masked inputs and the original input
                    cic = self.model(scored_input[_slice]) - logits[idcs[_slice]]
                    if targets is None:
                        layer_weights[_slice] += cic[:, class_idx]
                    else:
                        layer_weights[_slice] += cic.gather(1, targets[idcs[_slice]].view(-1, 1)).squeeze(1)

            # Average over the noise draws, then reshape the weights (N, C)
            weights.append(torch.softmax(layer_weights.div_(self.num_samples).view(b, c), -1))

        return weights

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(batch_size={self.bs}, num_samples={self.num_samples}, std={self.std})"
class ISCAM(ScoreCAM):
    r"""Implements a class activation map extractor as described in `"IS-CAM: Integrated Score-CAM for
    axiomatic-based explanations" <https://arxiv.org/pdf/2010.03023.pdf>`_.

    The localization map is computed as follows:

    .. math::
        L^{(c)}_{IS-CAM}(x, y) = ReLU\Big(\sum\limits_k w_k^{(c)} A_k(x, y)\Big)

    with the coefficient :math:`w_k^{(c)}` being defined as:

    .. math::
        w_k^{(c)} = softmax\Bigg(\frac{1}{N} \sum\limits_{i=1}^N
        \Big(Y^{(c)}(M_i) - Y^{(c)}(X_b)\Big)\Bigg)_k

    where :math:`N` is the number of samples used to smooth the weights,
    :math:`A_k(x, y)` is the activation of node :math:`k` in the target layer of the model at
    position :math:`(x, y)`, :math:`Y^{(c)}(X)` is the model output score for class :math:`c` before softmax
    for input :math:`X`, :math:`X_b` is a baseline image,
    and :math:`M_i` is defined as follows:

    .. math::
        M_i = \sum\limits_{j=1}^{i} \frac{j}{N}
        \frac{U(A_k) - \min\limits_m U(A_m)}{\max\limits_m U(A_m) - \min\limits_m U(A_m)} \odot X_b

    where :math:`\odot` refers to the element-wise multiplication, :math:`U` is the upsampling operation.

    >>> from torchvision.models import resnet18
    >>> from torchcam.methods import ISCAM
    >>> model = resnet18(pretrained=True).eval()
    >>> cam = ISCAM(model, 'layer4')
    >>> with torch.no_grad(): out = model(input_tensor)
    >>> cam(class_idx=100)

    Args:
        model: input model
        target_layer: either the target layer itself or its name, or a list of those
        batch_size: batch size used to forward masked inputs
        num_samples: number of scaled masked inputs whose scores are averaged for weight computation
        input_shape: shape of the expected input tensor excluding the batch dimension
    """

    def __init__(
        self,
        model: nn.Module,
        target_layer: Optional[Union[Union[nn.Module, str], List[Union[nn.Module, str]]]] = None,
        batch_size: int = 32,
        num_samples: int = 10,
        input_shape: Tuple[int, ...] = (3, 224, 224),
        **kwargs: Any,
    ) -> None:
        super().__init__(model, target_layer, batch_size, input_shape, **kwargs)

        self.num_samples = num_samples

    @torch.no_grad()
    def _get_score_weights(self, activations: List[Tensor], class_idx: Union[int, List[int]]) -> List[Tensor]:
        """Scores progressively scaled masked inputs and turns the averaged scores into channel weights.

        Args:
            activations: normalized activation maps, already upsampled to the spatial size of the input
            class_idx: a single class index, or one class index per batch sample

        Returns:
            one tensor per activation, holding the softmax over channels of the score difference between
            the scaled masked inputs and the original input, averaged over `num_samples` scaling steps
        """
        b = activations[0].shape[0]
        # (N, M)
        logits = self.model(self._input)
        # Per-sample targets are loop-invariant: build the index tensor once
        targets = None if isinstance(class_idx, int) else torch.tensor(class_idx, device=logits.device)

        weights: List[Tensor] = []
        for act in activations:
            # The channel count is read per layer, since hooked layers may differ in width
            c = act.shape[1]
            # (N * C, I, H, W)
            scored_input = (act.unsqueeze(2) * self._input.unsqueeze(1)).view(b * c, *self._input.shape[1:])
            # Batch sample index of each row of `scored_input`
            idcs = torch.arange(b, device=logits.device).repeat_interleave(c)
            # (N * C)
            layer_weights = torch.zeros(b * c, dtype=act.dtype, device=act.device)

            _coeff = 0.0
            for sidx in range(self.num_samples):
                # Cumulative scaling factor: 1/N, then 1/N + 2/N, and so on
                _coeff += (sidx + 1) / self.num_samples

                # Process by chunk (GPU RAM limitation)
                for start in range(0, b * c, self.bs):
                    _slice = slice(start, start + self.bs)
                    # Score difference between the scaled masked inputs and the original input
                    cic = self.model(_coeff * scored_input[_slice]) - logits[idcs[_slice]]
                    if targets is None:
                        layer_weights[_slice] += cic[:, class_idx]
                    else:
                        layer_weights[_slice] += cic.gather(1, targets[idcs[_slice]].view(-1, 1)).squeeze(1)

            # Average over the scaling steps, then reshape the weights (N, C)
            weights.append(torch.softmax(layer_weights.div_(self.num_samples).view(b, c), -1))

        return weights