Deep Learning for AI Interviews · Lesson 46 of 56

Object Detection: YOLO and Faster R-CNN Concepts

Classification vs Detection vs Segmentation

Image Classification:
  Input: image
  Output: class label (and probability)
  Example: "this is a chest X-ray showing pneumonia"

Object Detection:
  Input: image
  Output: list of (class, bounding box, confidence) for all objects
  Example: [(nodule, [x1=120, y1=80, x2=180, y2=140], 0.93)]

Semantic Segmentation:
  Input: image
  Output: class label per pixel
  Example: each pixel labelled as {background, lung, heart, nodule}

Instance Segmentation:
  Input: image
  Output: per-pixel class + instance ID (distinguishes individual objects)
  Example: nodule_1 and nodule_2 as separate masks

Bounding Box Representation

Python

import torch

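# Bounding box formats (as named in this lesson):
# XYXY: [x_min, y_min, x_max, y_max]  — corners
# XYWH: [x_center, y_center, width, height]  — center + size (YOLO format)
# CXCYWH: same layout as XYWH above, with the center made explicit in the name
# NOTE: COCO annotations and torchvision.ops.box_convert use "xywh" to mean
#       [x_min, y_min, width, height] (top-left corner + size) and reserve
#       "cxcywh" for the center-based layout, so check which convention a library expects.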

def xyxy_to_xywh(boxes: torch.Tensor) -> torch.Tensor:
    """Turn corner boxes into center/size boxes.

    Args:
        boxes: Tensor whose last dimension holds [x1, y1, x2, y2].

    Returns:
        Tensor of the same leading shape whose last dimension holds
        [cx, cy, w, h] (box center followed by width and height).
    """
    left, top, right, bottom = boxes.unbind(dim=-1)

    # Center is the midpoint of opposite corners; size is their difference.
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top

    return torch.stack((center_x, center_y, width, height), dim=-1)

def xywh_to_xyxy(boxes: torch.Tensor) -> torch.Tensor:
    """Turn center/size boxes into corner boxes.

    Args:
        boxes: Tensor whose last dimension holds [cx, cy, w, h].

    Returns:
        Tensor of the same leading shape whose last dimension holds
        [x1, y1, x2, y2] (top-left corner followed by bottom-right corner).
    """
    center_x, center_y, width, height = boxes.unbind(dim=-1)

    # Each corner sits half a box away from the center along both axes.
    half_w = width / 2
    half_h = height / 2
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return torch.stack(corners, dim=-1)

# IoU: Intersection over Union — measures bounding box overlap
def box_iou(boxes_a: torch.Tensor, boxes_b: torch.Tensor) -> torch.Tensor:
    """Compute element-wise IoU for pairs of boxes (XYXY format).

    Coordinates are read from the last dimension, so the inputs may be a
    single box of shape (4,), a batch of shape (N, 4), or carry any number of
    leading dimensions, as long as the leading shapes broadcast together.

    Args:
        boxes_a: Boxes as [x1, y1, x2, y2] along the last dimension.
        boxes_b: Boxes in the same layout; paired with ``boxes_a`` element-wise.

    Returns:
        Tensor of IoU values with the broadcast leading shape of the inputs
        (e.g. (N,) for two (N, 4) inputs, a 0-dim tensor for two (4,) inputs).
    """
    # Intersection rectangle: the largest top-left and smallest bottom-right corner.
    inter_x1 = torch.max(boxes_a[..., 0], boxes_b[..., 0])
    inter_y1 = torch.max(boxes_a[..., 1], boxes_b[..., 1])
    inter_x2 = torch.min(boxes_a[..., 2], boxes_b[..., 2])
    inter_y2 = torch.min(boxes_a[..., 3], boxes_b[..., 3])

    # Clamp at zero so disjoint boxes give an empty (not negative) intersection.
    inter_w = (inter_x2 - inter_x1).clamp(min=0)
    inter_h = (inter_y2 - inter_y1).clamp(min=0)
    inter_area = inter_w * inter_h

    # Union = sum of both areas minus the doubly counted overlap.
    area_a = (boxes_a[..., 2] - boxes_a[..., 0]) * (boxes_a[..., 3] - boxes_a[..., 1])
    area_b = (boxes_b[..., 2] - boxes_b[..., 0]) * (boxes_b[..., 3] - boxes_b[..., 1])
    union_area = area_a + area_b - inter_area

    # The small epsilon avoids 0/0 for pairs of zero-area boxes.
    return inter_area / (union_area + 1e-6)

# Example: two 40×60 boxes, the second shifted by (+5, +5) pixels
pred_box = torch.tensor([[10., 20., 50., 80.]])   # predicted box
true_box = torch.tensor([[15., 25., 55., 85.]])   # ground truth
iou = box_iou(pred_box, true_box)
print(f"IoU: {iou.item():.4f}")   # intersection 35×55 = 1925, union 2875 → IoU ≈ 0.6696

Non-Maximum Suppression

Python

import torch

def non_maximum_suppression(
    boxes: torch.Tensor,      # (N, 4) XYXY format
    scores: torch.Tensor,     # (N,) confidence scores
    iou_threshold: float = 0.5,
) -> torch.Tensor:
    """
    Greedy non-maximum suppression.

    Repeatedly keeps the highest-confidence remaining box and suppresses every
    other remaining box whose IoU with it is strictly greater than
    ``iou_threshold`` (a box whose IoU equals the threshold survives).

    Args:
        boxes: (N, 4) boxes in XYXY format.
        scores: (N,) confidence score for each box.
        iou_threshold: Overlap above which a lower-scoring box is discarded.

    Returns:
        1-D long tensor with the indices of the kept boxes, ordered by
        decreasing score and placed on the same device as ``scores``.
        Empty when there are no input boxes.
    """
    # Candidate indices, best score first.
    order = scores.argsort(descending=True)
    kept = []

    while order.numel() > 0:
        # The best remaining candidate is always kept.
        best_idx = order[0].item()
        kept.append(best_idx)

        remaining = order[1:]
        if remaining.numel() == 0:
            break

        # IoU of the kept box against every other remaining candidate.
        best_box = boxes[best_idx].unsqueeze(0)      # (1, 4)
        rest_boxes = boxes[remaining]                # (M, 4)
        ious = box_iou(best_box.expand_as(rest_boxes), rest_boxes)

        # Suppress only IoU > threshold, so candidates at or below it survive.
        order = remaining[ious <= iou_threshold]

    # Build the result on the inputs' device so it can index them directly.
    return torch.tensor(kept, dtype=torch.long, device=scores.device)

# Example: 5 candidate nodule detections forming two spatial clusters
# (indices 0, 1, 3 overlap each other; indices 2, 4 overlap each other)
boxes = torch.tensor([
    [10., 20., 50., 80.],   # primary detection
    [12., 22., 52., 82.],   # near-duplicate (high IoU with first)
    [100., 50., 150., 110.],  # separate region
    [13., 21., 51., 81.],   # another near-duplicate
    [98., 48., 148., 108.],  # near-duplicate of third
])
# One confidence per box; index 0 and index 2 are the top scorers of their clusters
scores = torch.tensor([0.95, 0.82, 0.87, 0.75, 0.79])

kept = non_maximum_suppression(boxes, scores, iou_threshold=0.5)
print(f"Kept detections: {kept.tolist()}")  # [0, 2] — one from each cluster
print(f"Kept scores: {scores[kept].tolist()}")  # ≈ [0.95, 0.87] (printed with float32 rounding)

YOLO: Real-Time Object Detection

YOLO (You Only Look Once) architecture:

1. Divide image into S×S grid (e.g., 13×13 for 416×416 input)

2. Each grid cell predicts B bounding boxes, each with:
   - 4 box parameters (cx, cy, w, h)
   - 1 objectness score P(object present)
   - C class probabilities

3. Output tensor: (S, S, B × (5 + C))
   For S=13, B=3, C=80 (COCO): 13×13×255

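4. Anchor boxes: predefined box shapes (width/height priors, typically chosen to match common object sizes in the training set)
   The network predicts offsets relative to these anchors and to the grid cell (not absolute coordinates)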

5. Loss = localisation loss + confidence loss + classification loss

6. NMS post-processing to remove duplicates

Python

import torch
import torch.nn as nn

class YOLOHead(nn.Module):
    """Minimal YOLO-style detection head: a single 1×1 convolution.

    For every spatial cell and every anchor the head emits ``5 + n_classes``
    values: box center (cx, cy), box size (w, h), objectness, and one score
    per class.
    """

    def __init__(
        self,
        in_channels: int,
        n_anchors: int = 3,
        n_classes: int = 1,   # just "nodule" for medical example
    ):
        super().__init__()
        self.n_anchors = n_anchors
        self.n_classes = n_classes
        # One group of (cx, cy, w, h, conf, classes...) channels per anchor.
        self.head = nn.Conv2d(
            in_channels,
            n_anchors * (5 + n_classes),
            kernel_size=1,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a backbone feature map to per-cell, per-anchor predictions.

        Args:
            x: Feature map of shape (B, C, H, W).

        Returns:
            Tensor of shape (B, H, W, n_anchors, 5 + n_classes). Along the
            last dimension: sigmoid-activated (cx, cy), raw (w, h),
            sigmoid-activated objectness, sigmoid-activated class scores.
        """
        values_per_anchor = 5 + self.n_classes
        raw = self.head(x)   # (B, n_anchors * values_per_anchor, H, W)
        batch, _, rows, cols = raw.shape

        # Split the channel axis into (anchor, value), then move both to the end.
        raw = raw.reshape(batch, self.n_anchors, values_per_anchor, rows, cols)
        raw = raw.permute(0, 3, 4, 1, 2)   # (B, H, W, n_anchors, values_per_anchor)

        center, size, objectness, class_scores = raw.split(
            [2, 2, 1, self.n_classes], dim=-1
        )

        activated = (
            center.sigmoid(),         # cx, cy ∈ (0, 1) relative to cell
            size,                     # left raw: log-scale relative to anchor
            objectness.sigmoid(),     # P(object)
            class_scores.sigmoid(),   # P(class | object)
        )
        return torch.cat(activated, dim=-1)

# Example: run the head on a batch of four 13×13 feature maps with 512 channels
head = YOLOHead(in_channels=512, n_anchors=3, n_classes=1)
feat_map = torch.randn(4, 512, 13, 13)
predictions = head(feat_map)
print(f"YOLO head output: {predictions.shape}")  # (4, 13, 13, 3, 6): last dim is 5 + n_classes = 6

Medical Object Detection: Lung Nodule

Python

import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def build_nodule_detector(n_classes: int = 2) -> nn.Module:
    """
    Build a Faster R-CNN (ResNet-50 + FPN) detector for lung nodules.

    Starts from the pretrained torchvision model and swaps its box predictor
    for a fresh one that outputs `n_classes` classes.

    Args:
        n_classes: Number of output classes including background;
            the default of 2 means background + nodule.

    Returns:
        The Faster R-CNN model with the replaced box predictor.
    """
    # NOTE(review): newer torchvision releases deprecate `pretrained=` in favour
    # of `weights=` — confirm against the installed version before changing.
    detector = fasterrcnn_resnet50_fpn(pretrained=True)

    # The new predictor must accept the same feature width as the one it replaces.
    roi_heads = detector.roi_heads
    feature_dim = roi_heads.box_predictor.cls_score.in_features
    roi_heads.box_predictor = FastRCNNPredictor(feature_dim, n_classes)

    return detector

# Faster R-CNN expects:
# - Images as list of (C, H, W) tensors
# - Targets as list of dicts: {"boxes": (N, 4), "labels": (N,)}
#   (boxes below are written as [x1, y1, x2, y2]; label 1 = nodule, 0 is background)
model = build_nodule_detector()

# Training mode: calling the model with images AND targets returns a dict of losses
model.train()
images = [torch.randn(3, 512, 512), torch.randn(3, 512, 512)]
targets = [
    {"boxes": torch.tensor([[100., 100., 150., 150.]]), "labels": torch.tensor([1])},
    {"boxes": torch.tensor([[200., 200., 280., 280.]]), "labels": torch.tensor([1])},
]
loss_dict = model(images, targets)
print(f"Faster R-CNN losses: {list(loss_dict.keys())}")
# Sum the individual loss terms into one value (what a training loop would backpropagate)
total_loss = sum(loss_dict.values())

# Inference mode: calling the model with images only returns one result dict per image
model.eval()
with torch.no_grad():   # no gradients needed for prediction
    detections = model(images)
print(f"Detection keys: {list(detections[0].keys())}")  # boxes, labels, scores

Interview Answer

"Object detection extends image classification to simultaneously locate and classify multiple objects. Key concepts: (1) Bounding boxes represented as [x1, y1, x2, y2] (corners) or [cx, cy, w, h] (center + size); (2) IoU (Intersection over Union) measures prediction quality — IoU > 0.5 is typically accepted as a correct detection; (3) NMS (Non-Maximum Suppression) removes duplicate detections by iteratively keeping the highest-confidence box and discarding overlapping boxes; (4) Two-stage detectors (Faster R-CNN) propose regions then classify each — higher accuracy, slower; one-stage detectors (YOLO) predict boxes and classes simultaneously from a grid — faster, slightly lower accuracy. For medical imaging: lung nodule detection on CT uses Faster R-CNN or LUNA16-trained YOLO; bounding box annotations are expensive so active learning (annotate the most uncertain images) reduces annotation cost. mAP (mean Average Precision) is the standard metric: AP is the area under the precision-recall curve for a single class at a given IoU threshold, and mAP is the mean of AP across classes (COCO-style mAP additionally averages over IoU thresholds from 0.50 to 0.95)."

ResNet, VGG, and Skip Connections

Next Lesson

Interview: Design a CNN for Image Classification