CNN Kernels and Feature Maps
What learned kernels detect, depthwise separable convolutions, dilated convolutions, and visualising what a CNN has learned.
What Kernels Learn
A 3×3 kernel slides over the feature map, computing dot products.
The dot product is high when the input patch resembles the kernel.
Classic hand-crafted kernels:
Edge detection (Sobel vertical): [-1, 0, 1; -2, 0, 2; -1, 0, 1]
Edge detection (Sobel horizontal): [-1,-2,-1; 0, 0, 0; 1, 2, 1]
Blur (Gaussian): [1, 2, 1; 2, 4, 2; 1, 2, 1] / 16
Sharpen: [0,-1, 0; -1, 5,-1; 0,-1, 0]
Learned kernels:
Layer 1: edge detectors, colour blobs (similar to hand-crafted)
Layer 2: corners, textures, T-junctions
Layer 3: complex textures, object parts
Layer 4+: semantic concepts
CNNs rediscover classical computer vision primitives — a result of
training on images, not a design choice.
Applying Kernels Manually
import torch
import torch.nn as nn
import torch.nn.functional as F
# Classic hand-crafted 3x3 filters. Each one is lifted to shape (1, 1, 3, 3)
# by indexing with two leading `None` axes so it can be handed to F.conv2d.

# Sobel filter with weights that change sign from left to right.
sobel_x = torch.tensor(
    [[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]
)[None, None]

# Sobel filter with weights that change sign from top to bottom.
sobel_y = torch.tensor(
    [[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]
)[None, None]

# Gaussian-style blur; dividing by 16 makes the weights sum to 1.
gaussian = torch.tensor(
    [[1., 2., 1.], [2., 4., 2.], [1., 2., 1.]]
)[None, None] / 16.0
def apply_kernel(image: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Apply a 2D kernel to a grayscale image tensor.

    Args:
        image: Input of shape (1, 1, H, W).
        kernel: Filter of shape (1, 1, kh, kw).

    Returns:
        The result of ``F.conv2d`` with zero padding of ``kh // 2`` rows and
        ``kw // 2`` columns on each side. For odd kernel sizes the output has
        the same spatial size as ``image``.
    """
    # Derive the padding from the kernel instead of hard-coding 1, so that
    # kernels other than 3x3 (1x1, 5x5, ...) also keep the spatial size.
    kernel_h, kernel_w = kernel.shape[-2:]
    return F.conv2d(image, kernel, padding=(kernel_h // 2, kernel_w // 2))
# Build a synthetic grayscale image (e.g., chest X-ray): an all-zero canvas
# with a single bright bar drawn into it.
H, W = 64, 64
image = torch.zeros(1, 1, H, W)
image[0, 0, 20:40, 10:55] = 1.0  # white rectangle (rib-like structure)

# Run the same image through each hand-crafted filter.
edge_x, edge_y, blurred = (
    apply_kernel(image, kernel) for kernel in (sobel_x, sobel_y, gaussian)
)

# Report the value range of the input and of every filtered result.
for label, result in (
    ("Original", image),
    ("Edge X", edge_x),
    ("Edge Y", edge_y),
    ("Blurred", blurred),
):
    print(f"{label}: min={result.min():.2f}, max={result.max():.2f}")
# CNN learns these and more complex patterns from data
Depthwise Separable Convolution
import torch
import torch.nn as nn
# Standard conv: in_ch × out_ch × k × k parameters
# Depthwise separable: splits into:
# 1. Depthwise conv: in_ch × 1 × k × k (one filter per input channel)
# 2. Pointwise conv: out_ch × in_ch × 1 × 1 (mixes channels)
#
# Parameter ratio: (in_ch × k² + out_ch × in_ch) / (out_ch × in_ch × k²)
# = 1/out_ch + 1/k² ≈ 0.13 for k=3, out_ch=64 (≈8× fewer params)
#
# Used in: MobileNet, Xception, EfficientNet, many mobile architectures
class DepthwiseSeparableConv(nn.Module):
    """Two-stage convolution: per-channel spatial filtering, then 1x1 channel mixing.

    Each stage is followed by batch normalisation and ReLU6.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by the pointwise stage.
        kernel_size: Spatial size of the depthwise filters.
        stride: Stride of the depthwise convolution.
        padding: Zero padding of the depthwise convolution.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: int = 1,
    ):
        super().__init__()

        # groups == in_channels gives every input channel its own filter,
        # which is what makes this convolution "depthwise".
        spatial_filter = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
            bias=False,
        )
        self.depthwise = nn.Sequential(
            spatial_filter,
            nn.BatchNorm2d(in_channels),
            nn.ReLU6(),  # ReLU6 is standard in MobileNet
        )

        # A 1x1 convolution recombines the independently filtered channels.
        channel_mixer = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pointwise = nn.Sequential(
            channel_mixer,
            nn.BatchNorm2d(out_channels),
            nn.ReLU6(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the depthwise stage and feed its result to the pointwise stage."""
        spatially_filtered = self.depthwise(x)
        return self.pointwise(spatially_filtered)
# Compare the parameter budget of a dense 3x3 convolution with that of the
# factorised block for the same 64 -> 128 channel mapping.
standard_conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)
depthwise_sep = DepthwiseSeparableConv(64, 128)

# Count every learnable scalar in each module.
n_standard, n_dws = (
    sum(weight.numel() for weight in layer.parameters())
    for layer in (standard_conv, depthwise_sep)
)

print(f"Standard conv: {n_standard:,} parameters")
print(f"Depthwise sep: {n_dws:,} parameters")
print(f"Reduction: {n_standard/n_dws:.1f}x")

# Push a random batch through the factorised block.
x = torch.randn(8, 64, 56, 56)
out = depthwise_sep(x)
print(f"Output shape: {out.shape}") # (8, 128, 56, 56)
Dilated (Atrous) Convolution
import torch
import torch.nn as nn
# Dilated conv: inserts gaps between kernel elements
# dilation=1: standard 3×3 kernel (no gaps)
# dilation=2: 3×3 kernel with gaps → effective receptive field = 5×5
# dilation=4: effective receptive field = 9×9
#
# Advantage: large receptive field without more parameters or downsampling
# Used in: DeepLab (semantic segmentation), WaveNet (audio), temporal models
class DilatedConvBlock(nn.Module):
    """3x3 dilated convolution followed by BatchNorm and ReLU.

    The block maps ``channels`` input channels to ``channels`` output channels.

    Args:
        channels: Number of input and output channels.
        dilation: Spacing between kernel elements.
    """

    def __init__(self, channels: int, dilation: int):
        super().__init__()
        # padding = dilation to maintain spatial size
        self.conv = nn.Sequential(
            nn.Conv2d(
                channels,
                channels,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the dilated conv -> BatchNorm -> ReLU stack to ``x``."""
        return self.conv(x)


# ASPP: Atrous Spatial Pyramid Pooling (DeepLab)
# Multiple dilated convs with different dilation rates in parallel
class ASPP(nn.Module):
    """Parallel dilated convolutions at multiple scales.

    One 1x1 branch and three dilated 3x3 branches (dilation 6, 12, 18) see the
    same input; their outputs are concatenated along the channel axis and
    projected to ``out_channels`` with a 1x1 convolution.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels of the 1x1 branch and of the result.
    """

    def __init__(self, in_channels: int, out_channels: int = 256):
        super().__init__()
        dilations = (6, 12, 18)
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
            ),
            *(DilatedConvBlock(in_channels, dilation=rate) for rate in dilations),
        ])
        # The 1x1 branch emits out_channels while every dilated branch keeps
        # in_channels, so the concatenation is out_channels + 3 * in_channels
        # wide. Sizing the projection as 4 * in_channels only works when
        # in_channels == out_channels and fails at forward time otherwise.
        concat_channels = out_channels + len(dilations) * in_channels
        self.project = nn.Conv2d(concat_channels, out_channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run all branches on ``x``, concatenate on dim 1, and project."""
        outs = [conv(x) for conv in self.convs]
        return self.project(torch.cat(outs, dim=1))
# Smoke test: run a random batch of 4 inputs with 256 channels at 32x32
# through an ASPP module whose input and output channel counts are both 256.
x = torch.randn(4, 256, 32, 32)
aspp = ASPP(in_channels=256, out_channels=256)
out = aspp(x)
print(f"ASPP output: {out.shape}") # (4, 256, 32, 32)
Feature Map Visualisation
import torch
import torch.nn as nn
def visualise_feature_maps(
    model: nn.Module,
    X: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    """Extract and return feature maps from a specific layer.

    A forward hook is attached to the named submodule, ``model(X)`` is run
    under ``torch.no_grad()``, and the hook is removed again.

    Args:
        model: Module to run.
        X: Input passed to ``model``.
        layer_name: Name of the target submodule as reported by
            ``model.named_modules()``.

    Returns:
        The detached output of the target layer, or ``None`` if the layer was
        not executed during the forward pass.

    Raises:
        ValueError: If ``layer_name`` is not a submodule of ``model``.
    """
    # Resolve the layer up front so that a typo fails immediately with a clear
    # message instead of surfacing as an unbound local after the forward pass.
    modules = dict(model.named_modules())
    if layer_name not in modules:
        raise ValueError(f"Layer {layer_name!r} not found in model")

    feature_maps = {}

    def hook(module, inputs, output):
        feature_maps[layer_name] = output.detach()

    handle = modules[layer_name].register_forward_hook(hook)
    try:
        with torch.no_grad():
            model(X)
    finally:
        # Always detach the hook, even if the forward pass raises, so the
        # model is not left with a stale hook attached.
        handle.remove()
    return feature_maps.get(layer_name)
# Activation maximisation: find input that maximises a neuron's activation
def maximise_activation(
    model: nn.Module,
    layer_name: str,
    neuron_idx: int,
    n_steps: int = 200,
    lr: float = 0.1,
    input_shape: tuple = (1, 3, 64, 64),
) -> torch.Tensor:
    """Create an input that maximally activates a specific neuron.

    Starting from Gaussian noise, the input is optimised with Adam to
    maximise the mean of ``output[0, neuron_idx]`` of the named layer.

    Args:
        model: Module to run. Its parameters are not updated, but
            ``loss.backward()`` still accumulates into their ``.grad`` if
            they require gradients.
        layer_name: Name of the target submodule as reported by
            ``model.named_modules()``.
        neuron_idx: Index along dimension 1 of the layer output.
        n_steps: Number of optimisation steps.
        lr: Adam learning rate.
        input_shape: Shape of the optimised input tensor.

    Returns:
        The optimised input, detached from the autograd graph.

    Raises:
        ValueError: If ``layer_name`` is not a submodule of ``model``.
    """
    # Resolve the layer before doing any work so an unknown name fails with a
    # clear message rather than an unbound local variable.
    modules = dict(model.named_modules())
    if layer_name not in modules:
        raise ValueError(f"Layer {layer_name!r} not found in model")

    # Start from random noise
    x = torch.randn(*input_shape, requires_grad=True)
    optimizer = torch.optim.Adam([x], lr=lr)

    # Hook to capture target activation
    target_activation = {}

    def hook(module, inputs, output):
        target_activation["val"] = output[0, neuron_idx]  # first sample, specific neuron

    handle = modules[layer_name].register_forward_hook(hook)
    try:
        for _ in range(n_steps):
            optimizer.zero_grad()
            model(x)
            loss = -target_activation["val"].mean()  # maximise activation
            # Every step runs a fresh forward pass, so the previous graph is
            # never reused and does not need to be retained.
            loss.backward()
            optimizer.step()
    finally:
        # Detach the hook even if a step raises.
        handle.remove()
    return x.detach()
# The resulting x shows what pattern the neuron is "looking for"
Interview Answer
"CNN kernels are small learned weight matrices (3×3 is standard) that detect spatial patterns. Shallow layers learn edges and colour blobs; deeper layers learn textures, object parts, and semantic concepts. Depthwise separable convolution (used in MobileNet) factorises a standard conv into two cheaper steps: depthwise (one filter per input channel, detecting spatial patterns) and pointwise (1×1 conv mixing channels) — roughly 8–9× fewer parameters for a 3×3 conv with many channels. Dilated convolution inserts gaps between kernel elements, expanding the effective receptive field without increasing parameters or downsampling — critical for segmentation tasks where spatial resolution must be maintained. Feature map visualisation techniques (activation maximisation, gradient visualisation) show that CNN kernels genuinely rediscover classical edge detectors in the first layer — an emergent property of training on natural images."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.