Deep Learning for AI Interviews · Lesson 46 of 56
Object Detection: YOLO and Faster R-CNN Concepts
Classification vs Detection vs Segmentation
Image Classification:
Input: image
Output: class label (and probability)
Example: "this is a chest X-ray showing pneumonia"
Object Detection:
Input: image
Output: list of (class, bounding box, confidence) for all objects
Example: [(nodule, [x1=120, y1=80, x2=180, y2=140], 0.93)]
Semantic Segmentation:
Input: image
Output: class label per pixel
Example: each pixel labelled as {background, lung, heart, nodule}
Instance Segmentation:
Input: image
Output: per-pixel class + instance ID (distinguishes individual objects)
Example: nodule_1 and nodule_2 as separate masks
Bounding Box Representation
import torch
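# Bounding box formats:
# XYXY: [x_min, y_min, x_max, y_max] — corners
# CXCYWH: [x_center, y_center, width, height] — center + size (YOLO format)
# XYWH: used in this lesson as shorthand for CXCYWH; note that COCO annotations and
#       torchvision.ops.box_convert use "xywh" for [x_min, y_min, width, height]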
def xyxy_to_xywh(boxes: torch.Tensor) -> torch.Tensor:
    """Turn corner-format boxes into centre/size format.

    The last dimension of ``boxes`` holds ``[x1, y1, x2, y2]``; the result
    keeps the same leading shape with ``[cx, cy, w, h]`` in the last
    dimension.
    """
    left, top, right, bottom = boxes.unbind(dim=-1)
    # The centre is the midpoint of opposite edges; the size is their distance.
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return torch.stack((center_x, center_y, width, height), dim=-1)
def xywh_to_xyxy(boxes: torch.Tensor) -> torch.Tensor:
    """Turn centre/size boxes back into corner format.

    The last dimension of ``boxes`` holds ``[cx, cy, w, h]``; the result
    keeps the same leading shape with ``[x1, y1, x2, y2]`` in the last
    dimension.
    """
    center_x, center_y, width, height = boxes.unbind(dim=-1)
    # Each corner sits half an extent away from the centre.
    half_w = width / 2
    half_h = height / 2
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=-1)
# IoU: Intersection over Union — measures bounding box overlap
def box_iou(boxes_a: torch.Tensor, boxes_b: torch.Tensor) -> torch.Tensor:
    """Compute element-wise IoU between two sets of XYXY boxes.

    Boxes are paired along their leading dimensions (row i of ``boxes_a``
    with row i of ``boxes_b``); the result is NOT an all-pairs N x M matrix.
    Leading dimensions follow the usual broadcasting rules, so a single box
    of shape (4,) or (1, 4) can be compared against a batch of shape (M, 4).

    Args:
        boxes_a: tensor whose last dimension holds [x1, y1, x2, y2].
        boxes_b: tensor in the same format, broadcastable against ``boxes_a``.

    Returns:
        Tensor of IoU values with the broadcast leading shape.
    """
    # Intersection rectangle: the innermost of each pair of edges.
    inter_x1 = torch.max(boxes_a[..., 0], boxes_b[..., 0])
    inter_y1 = torch.max(boxes_a[..., 1], boxes_b[..., 1])
    inter_x2 = torch.min(boxes_a[..., 2], boxes_b[..., 2])
    inter_y2 = torch.min(boxes_a[..., 3], boxes_b[..., 3])
    # Clamping at zero makes the area 0 when the boxes do not overlap on an axis.
    inter_w = (inter_x2 - inter_x1).clamp(min=0)
    inter_h = (inter_y2 - inter_y1).clamp(min=0)
    inter_area = inter_w * inter_h

    # Union = sum of both areas minus the part counted twice.
    area_a = (boxes_a[..., 2] - boxes_a[..., 0]) * (boxes_a[..., 3] - boxes_a[..., 1])
    area_b = (boxes_b[..., 2] - boxes_b[..., 0]) * (boxes_b[..., 3] - boxes_b[..., 1])
    union_area = area_a + area_b - inter_area

    # The small epsilon avoids 0/0 when both boxes have zero area.
    return inter_area / (union_area + 1e-6)
# Example: two 40×60 boxes, the prediction offset by 5 px along each axis
pred_box = torch.tensor([[10., 20., 50., 80.]]) # predicted box
true_box = torch.tensor([[15., 25., 55., 85.]]) # ground truth
# Intersection is 35 × 55 = 1925 and union is 2400 + 2400 − 1925 = 2875, so IoU ≈ 0.67
iou = box_iou(pred_box, true_box)
print(f"IoU: {iou.item():.4f}") # ≈ 0.67 (1925 / 2875) for mostly overlapping boxes
Non-Maximum Suppression
import torch
def non_maximum_suppression(
    boxes: torch.Tensor,  # (N, 4) XYXY format
    scores: torch.Tensor,  # (N,) confidence scores
    iou_threshold: float = 0.5,
) -> torch.Tensor:
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-confidence remaining box and discards every
    other remaining box whose IoU with it is strictly greater than
    ``iou_threshold``. A box whose IoU equals the threshold is kept, so with
    ``iou_threshold=0`` boxes that do not overlap at all survive.

    Args:
        boxes: (N, 4) tensor of boxes in XYXY format.
        scores: (N,) tensor of confidence scores, index-aligned with ``boxes``.
        iou_threshold: overlap above which a lower-scoring box is suppressed.

    Returns:
        1-D long tensor of kept indices into ``boxes``, ordered by descending
        score and placed on the same device as ``boxes``.

    Raises:
        ValueError: if ``boxes`` and ``scores`` disagree on the number of boxes.
    """
    if boxes.shape[0] != scores.shape[0]:
        raise ValueError(
            f"boxes and scores must describe the same number of detections, "
            f"got {boxes.shape[0]} and {scores.shape[0]}"
        )

    # Candidates still in play, best score first.
    order = scores.argsort(descending=True)
    kept = []
    while order.numel() > 0:
        # The best remaining candidate always survives.
        best_idx = order[0].item()
        kept.append(best_idx)

        rest = order[1:]
        if rest.numel() == 0:
            break

        # IoU of the survivor against every other remaining candidate.
        best_box = boxes[best_idx].unsqueeze(0)  # (1, 4)
        rest_boxes = boxes[rest]  # (M, 4)
        ious = box_iou(best_box.expand_as(rest_boxes), rest_boxes)

        # Suppress only IoU > threshold, i.e. keep IoU <= threshold.
        order = rest[ious <= iou_threshold]

    # Build the index tensor on the inputs' device so it can be used directly.
    return torch.tensor(kept, dtype=torch.long, device=boxes.device)
# Example: 5 candidate nodule detections forming two spatial clusters
boxes = torch.tensor([
    [10., 20., 50., 80.], # primary detection
    [12., 22., 52., 82.], # near-duplicate (high IoU with first)
    [100., 50., 150., 110.], # separate region
    [13., 21., 51., 81.], # another near-duplicate
    [98., 48., 148., 108.], # near-duplicate of third
])
# Scores are index-aligned with the rows of `boxes`
scores = torch.tensor([0.95, 0.82, 0.87, 0.75, 0.79])
# Within each cluster only the highest-scoring box survives (indices 0 and 2)
kept = non_maximum_suppression(boxes, scores, iou_threshold=0.5)
print(f"Kept detections: {kept.tolist()}") # [0, 2] — one from each cluster
print(f"Kept scores: {scores[kept].tolist()}")
YOLO: Real-Time Object Detection
YOLO (You Only Look Once) architecture:
1. Divide image into S×S grid (e.g., 13×13 for 416×416 input)
2. Each grid cell predicts B bounding boxes, each with:
- 4 box parameters (cx, cy, w, h)
- 1 objectness score P(object present)
- C class probabilities
3. Output tensor: (S, S, B × (5 + C))
For S=13, B=3, C=80 (COCO): 13×13×255
4. Anchor boxes: predefined aspect ratios
The network predicts offsets from anchor boxes (not absolute coordinates)
5. Loss = localisation loss + confidence loss + classification loss
6. NMS post-processing to remove duplicates
import torch
import torch.nn as nn
class YOLOHead(nn.Module):
    """Minimal YOLO-style prediction head built from a single 1x1 convolution."""

    def __init__(
        self,
        in_channels: int,
        n_anchors: int = 3,
        n_classes: int = 1,  # a single "nodule" class in the medical example
    ):
        super().__init__()
        self.n_anchors = n_anchors
        self.n_classes = n_classes
        # Every anchor emits cx, cy, w, h, objectness plus one score per class.
        per_anchor = 5 + n_classes
        self.head = nn.Conv2d(in_channels, n_anchors * per_anchor, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a (B, C, H, W) backbone feature map to per-anchor predictions.

        Returns a tensor of shape (B, H, W, n_anchors, 5 + n_classes) whose
        last dimension is [cx, cy, w, h, objectness, class scores...].
        """
        raw = self.head(x)  # (B, n_anchors * (5 + n_classes), H, W)
        batch, _, rows, cols = raw.shape
        # Move channels last, then split them into (anchor, field).
        raw = raw.permute(0, 2, 3, 1).reshape(
            batch, rows, cols, self.n_anchors, 5 + self.n_classes
        )
        # Sigmoid squashes everything except w/h, which stay as raw
        # log-scale offsets relative to the anchor.
        squashed = torch.sigmoid(raw)
        centre_xy = squashed[..., :2]  # in (0, 1), relative to the grid cell
        size_wh = raw[..., 2:4]
        conf_and_cls = squashed[..., 4:]  # P(object), then P(class | object)
        return torch.cat([centre_xy, size_wh, conf_and_cls], dim=-1)
# Example: run the head on a random stand-in for a backbone feature map
head = YOLOHead(in_channels=512, n_anchors=3, n_classes=1)
# (B, C, H, W) = batch of 4, 512 channels, 13×13 grid
feat_map = torch.randn(4, 512, 13, 13)
predictions = head(feat_map)
print(f"YOLO head output: {predictions.shape}") # (4, 13, 13, 3, 6): last dim = 5 + n_classes
Medical Object Detection: Lung Nodule
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
def build_nodule_detector(n_classes: int = 2) -> nn.Module:
    """Build a Faster R-CNN (ResNet-50 FPN) fine-tunable for lung nodule detection.

    Starts from the pretrained torchvision detector and swaps its box
    predictor for a freshly initialised one sized for ``n_classes``.

    Args:
        n_classes: number of output classes including background
            (the default 2 means background + nodule).

    Returns:
        The detection model with the new, untrained box predictor.
    """
    # Load pretrained Faster R-CNN. `weights="DEFAULT"` replaces the
    # `pretrained=True` flag, which is deprecated since torchvision 0.13.
    model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
    # Replace the box predictor (class scores and per-class box regression)
    # so its output size matches our class count.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)
    return model
# Faster R-CNN expects:
# - Images as list of (C, H, W) tensors
# - Targets as list of dicts: {"boxes": (N, 4), "labels": (N,)}
model = build_nodule_detector()
# Training mode: called with images AND targets, the model returns a dict of losses
model.train()
# Two random 3×512×512 tensors stand in for real scans
images = [torch.randn(3, 512, 512), torch.randn(3, 512, 512)]
# One box per image; label 1 is presumably the nodule class (0 = background) — confirm
targets = [
    {"boxes": torch.tensor([[100., 100., 150., 150.]]), "labels": torch.tensor([1])},
    {"boxes": torch.tensor([[200., 200., 280., 280.]]), "labels": torch.tensor([1])},
]
loss_dict = model(images, targets)
print(f"Faster R-CNN losses: {list(loss_dict.keys())}")
# Sum the individual loss terms into the single scalar to optimise
total_loss = sum(loss_dict.values())
# Inference mode: called with images only, the model returns one dict per image
model.eval()
with torch.no_grad():
    detections = model(images)
print(f"Detection keys: {list(detections[0].keys())}") # boxes, labels, scores
Interview Answer
"Object detection extends image classification to simultaneously locate and classify multiple objects. Key concepts: (1) Bounding boxes represented as [x1, y1, x2, y2] (corners) or [cx, cy, w, h] (center + size); (2) IoU (Intersection over Union) measures prediction quality — IoU > 0.5 is typically accepted as a correct detection; (3) NMS (Non-Maximum Suppression) removes duplicate detections by iteratively keeping the highest-confidence box and discarding overlapping boxes; (4) Two-stage detectors (Faster R-CNN) propose regions then classify each — higher accuracy, slower; one-stage detectors (YOLO) predict boxes and classes simultaneously from a grid — faster, slightly lower accuracy. For medical imaging: lung nodule detection on CT uses Faster R-CNN or LUNA16-trained YOLO; bounding box annotations are expensive so active learning (annotate the most uncertain images) reduces annotation cost. mAP (mean Average Precision) is the standard metric: AP is the area under the precision-recall curve for one class at a given IoU threshold, and mAP averages AP across classes (COCO-style mAP additionally averages across IoU thresholds from 0.50 to 0.95)."