SQL & Database Mastery · Lesson 2 of 6
Window Functions & CTEs
Why Advanced SQL Matters
Junior developers write queries that return the right results. Senior engineers write queries that return the right results and run in milliseconds on 100 million rows. This guide covers the techniques that separate the two.
We'll continue with the e-commerce database from the fundamentals guide.
Window Functions
Window functions perform calculations across a set of rows related to the current row — without collapsing them into a single group like GROUP BY does.
-- Syntax
function_name() OVER (
PARTITION BY column -- optional: separate window per group
ORDER BY column -- defines row ordering within the window
ROWS/RANGE ... -- optional: frame specification
)ROW_NUMBER — Sequential numbering
-- Rank customers by their total spending (per city)
SELECT
c.name,
c.city,
SUM(o.total_amount) AS total_spent,
ROW_NUMBER() OVER (
PARTITION BY c.city
ORDER BY SUM(o.total_amount) DESC
) AS rank_in_city
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.city
ORDER BY c.city, rank_in_city;Real-world use: "Top 3 customers in each city" report.
RANK and DENSE_RANK
-- RANK: ties get the same rank, next rank is skipped (1,1,3)
-- DENSE_RANK: ties get the same rank, no gaps (1,1,2)
SELECT
name,
total_amount,
RANK() OVER (ORDER BY total_amount DESC) AS rank,
DENSE_RANK() OVER (ORDER BY total_amount DESC) AS dense_rank
FROM orders
ORDER BY total_amount DESC;LAG and LEAD — Access adjacent rows
-- Month-over-month revenue growth
WITH monthly_revenue AS (
SELECT
DATE_TRUNC('month', order_date) AS month,
SUM(total_amount) AS revenue
FROM orders
WHERE status = 'delivered'
GROUP BY 1
)
SELECT
month,
revenue,
LAG(revenue) OVER (ORDER BY month) AS prev_month,
revenue - LAG(revenue) OVER (ORDER BY month) AS absolute_change,
ROUND(
100.0 * (revenue - LAG(revenue) OVER (ORDER BY month))
/ NULLIF(LAG(revenue) OVER (ORDER BY month), 0),
1
) AS pct_change
FROM monthly_revenue
ORDER BY month;Real-world use: Growth reporting in dashboards.
SUM / AVG as window functions — Running totals
-- Running total of revenue
SELECT
order_date,
total_amount,
SUM(total_amount) OVER (ORDER BY order_date) AS running_total
FROM orders
WHERE status = 'delivered'
ORDER BY order_date;
-- 7-day moving average of daily revenue
WITH daily AS (
SELECT order_date, SUM(total_amount) AS daily_revenue
FROM orders
WHERE status = 'delivered'
GROUP BY order_date
)
SELECT
order_date,
daily_revenue,
AVG(daily_revenue) OVER (
ORDER BY order_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS moving_avg_7d
FROM daily;NTILE — Divide into buckets
-- Segment customers into 4 spending quartiles
WITH customer_spend AS (
SELECT
customer_id,
SUM(total_amount) AS total_spent
FROM orders
GROUP BY customer_id
)
SELECT
c.name,
cs.total_spent,
NTILE(4) OVER (ORDER BY cs.total_spent DESC) AS quartile
FROM customer_spend cs
JOIN customers c ON cs.customer_id = c.customer_id;
-- Quartile 1 = top 25% spendersFIRST_VALUE / LAST_VALUE
-- Show each order alongside the customer's first and latest order date
SELECT
customer_id,
order_id,
order_date,
FIRST_VALUE(order_date) OVER (
PARTITION BY customer_id ORDER BY order_date
) AS first_order_date,
LAST_VALUE(order_date) OVER (
PARTITION BY customer_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_order_date
FROM orders;Advanced CTEs
Chained CTEs
-- Customer lifetime value analysis
WITH order_stats AS (
SELECT
customer_id,
COUNT(*) AS order_count,
SUM(total_amount) AS lifetime_value,
MIN(order_date) AS first_order,
MAX(order_date) AS last_order
FROM orders
WHERE status = 'delivered'
GROUP BY customer_id
),
customer_segments AS (
SELECT
customer_id,
order_count,
lifetime_value,
first_order,
last_order,
CASE
WHEN lifetime_value >= 5000 THEN 'VIP'
WHEN lifetime_value >= 1000 THEN 'Loyal'
WHEN order_count = 1 THEN 'One-time'
ELSE 'Regular'
END AS segment
FROM order_stats
)
SELECT
segment,
COUNT(*) AS customer_count,
AVG(lifetime_value) AS avg_ltv,
AVG(order_count) AS avg_orders
FROM customer_segments
GROUP BY segment
ORDER BY avg_ltv DESC;Recursive CTEs — Hierarchies and sequences
-- Generate a date series (no table needed)
WITH RECURSIVE date_series AS (
SELECT '2025-01-01'::DATE AS d
UNION ALL
SELECT d + INTERVAL '1 day'
FROM date_series
WHERE d < '2025-12-31'
)
SELECT d FROM date_series;
-- Employee hierarchy (org chart traversal)
CREATE TABLE employees (
emp_id INT PRIMARY KEY,
name VARCHAR(100),
manager_id INT REFERENCES employees(emp_id)
);
WITH RECURSIVE org_chart AS (
-- Anchor: CEO (no manager)
SELECT emp_id, name, manager_id, 0 AS depth, name AS path
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive: employees whose manager is in the previous level
SELECT e.emp_id, e.name, e.manager_id, oc.depth + 1,
oc.path || ' > ' || e.name
FROM employees e
JOIN org_chart oc ON e.manager_id = oc.emp_id
)
SELECT
REPEAT(' ', depth) || name AS hierarchy,
depth,
path
FROM org_chart
ORDER BY path;CASE Expressions
-- Categorize orders by size
SELECT
order_id,
total_amount,
CASE
WHEN total_amount >= 1000 THEN 'Large'
WHEN total_amount >= 100 THEN 'Medium'
ELSE 'Small'
END AS order_size
FROM orders;
-- Pivot: count orders by status per month
SELECT
DATE_TRUNC('month', order_date) AS month,
COUNT(CASE WHEN status = 'delivered' THEN 1 END) AS delivered,
COUNT(CASE WHEN status = 'pending' THEN 1 END) AS pending,
COUNT(CASE WHEN status = 'cancelled' THEN 1 END) AS cancelled
FROM orders
GROUP BY 1
ORDER BY 1;Advanced Joins
Self Join
-- Find customers who ordered on the same day
SELECT
a.customer_id AS customer1,
b.customer_id AS customer2,
a.order_date
FROM orders a
JOIN orders b
ON a.order_date = b.order_date
AND a.customer_id < b.customer_id -- avoid duplicates
ORDER BY a.order_date;CROSS JOIN — All combinations
-- Generate all product-customer combinations for a targeted campaign
SELECT c.name AS customer, p.name AS product
FROM customers c
CROSS JOIN products p
WHERE c.tier = 'pro'
AND p.category = 'Electronics';Lateral Join / APPLY
-- PostgreSQL: top 3 orders per customer using LATERAL
SELECT c.name, top_orders.order_id, top_orders.total_amount
FROM customers c
JOIN LATERAL (
SELECT order_id, total_amount
FROM orders
WHERE customer_id = c.customer_id
ORDER BY total_amount DESC
LIMIT 3
) top_orders ON true;Gaps and Islands
A classic pattern for finding consecutive sequences in time-series data.
-- Find date ranges where each product was in stock continuously
WITH inventory_log AS (
SELECT product_id, date, stock > 0 AS in_stock
FROM daily_inventory
),
grouped AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY date) -
ROW_NUMBER() OVER (PARTITION BY product_id, in_stock ORDER BY date) AS grp
FROM inventory_log
)
SELECT
product_id,
in_stock,
MIN(date) AS period_start,
MAX(date) AS period_end
FROM grouped
GROUP BY product_id, in_stock, grp
ORDER BY product_id, period_start;Performance Tuning
EXPLAIN — Understanding execution plans
-- See how the database will execute a query
EXPLAIN SELECT * FROM orders WHERE customer_id = 42;
-- Get actual run times (execute the query)
EXPLAIN ANALYZE SELECT * FROM orders WHERE customer_id = 42;What to look for:
- Seq Scan on large tables = missing index
- Hash Join vs Nested Loop — hash join is better for large datasets
- Sort operations — can be eliminated with the right index
- Rows estimate vs actual — large divergence = stale statistics
Index strategies
-- Partial index: only index active orders (much smaller, faster)
CREATE INDEX idx_active_orders ON orders(customer_id)
WHERE status NOT IN ('delivered', 'cancelled');
-- Covering index: include columns to avoid table lookup
CREATE INDEX idx_orders_covering ON orders(customer_id)
INCLUDE (total_amount, order_date, status);
-- Expression index
CREATE INDEX idx_lower_email ON customers(LOWER(email));
-- Enables: WHERE LOWER(email) = 'user@example.com'
-- Composite index: order matters
CREATE INDEX idx_orders_date_status ON orders(order_date, status);
-- Efficient for: WHERE order_date = X AND status = Y
-- Also efficient for: WHERE order_date = X
-- NOT efficient for: WHERE status = Y aloneQuery optimization patterns
-- BAD: function on indexed column kills the index
SELECT * FROM orders WHERE YEAR(order_date) = 2025;
-- GOOD: use a range instead
SELECT * FROM orders
WHERE order_date >= '2025-01-01' AND order_date < '2026-01-01';
-- BAD: SELECT * pulls unused columns
SELECT * FROM orders WHERE customer_id = 42;
-- GOOD: only fetch what you need
SELECT order_id, order_date, total_amount
FROM orders WHERE customer_id = 42;
-- BAD: correlated subquery executes once per row
SELECT name,
(SELECT COUNT(*) FROM orders WHERE customer_id = c.customer_id) AS cnt
FROM customers c;
-- GOOD: JOIN is evaluated once
SELECT c.name, COUNT(o.order_id) AS cnt
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name;DISTINCT and Deduplication
-- Unique cities where we have customers
SELECT DISTINCT city FROM customers ORDER BY city;
-- Distinct on one column (keep first row per group) — PostgreSQL
SELECT DISTINCT ON (customer_id) customer_id, order_id, order_date
FROM orders
ORDER BY customer_id, order_date DESC; -- keeps most recent order per customer
-- Remove duplicate rows (keep one)
WITH numbered AS (
SELECT *, ROW_NUMBER() OVER (
PARTITION BY email
ORDER BY customer_id
) AS rn
FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
SELECT customer_id FROM numbered WHERE rn > 1
);String Aggregation
-- Comma-separated list of products per order
SELECT
o.order_id,
STRING_AGG(p.name, ', ' ORDER BY p.name) AS products
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY o.order_id;
-- MySQL equivalent
SELECT
o.order_id,
GROUP_CONCAT(p.name ORDER BY p.name SEPARATOR ', ') AS products
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
GROUP BY o.order_id;JSON in SQL (PostgreSQL)
Modern databases support JSON columns for semi-structured data.
-- Store and query product metadata as JSON
ALTER TABLE products ADD COLUMN metadata JSONB;
UPDATE products SET metadata = '{"color": "red", "weight_kg": 1.5}'
WHERE product_id = 1;
-- Query by JSON field
SELECT name FROM products
WHERE metadata->>'color' = 'red';
-- Index a JSON field
CREATE INDEX idx_product_color ON products((metadata->>'color'));
-- Aggregate JSON data
SELECT
metadata->>'color' AS color,
COUNT(*) AS count,
AVG(price) AS avg_price
FROM products
WHERE metadata IS NOT NULL
GROUP BY metadata->>'color';Production-Ready Query Template
Here's a comprehensive reporting query that combines everything:
-- Weekly cohort retention analysis
WITH cohorts AS (
SELECT
customer_id,
DATE_TRUNC('week', MIN(order_date)) AS cohort_week
FROM orders
GROUP BY customer_id
),
cohort_activity AS (
SELECT
c.cohort_week,
DATE_TRUNC('week', o.order_date) AS activity_week,
COUNT(DISTINCT o.customer_id) AS active_customers
FROM orders o
JOIN cohorts c ON o.customer_id = c.customer_id
GROUP BY 1, 2
),
cohort_sizes AS (
SELECT cohort_week, COUNT(*) AS cohort_size
FROM cohorts
GROUP BY 1
)
SELECT
ca.cohort_week,
cs.cohort_size,
ca.activity_week,
EXTRACT(WEEK FROM (ca.activity_week - ca.cohort_week)) AS week_number,
ca.active_customers,
ROUND(100.0 * ca.active_customers / cs.cohort_size, 1) AS retention_pct
FROM cohort_activity ca
JOIN cohort_sizes cs ON ca.cohort_week = cs.cohort_week
ORDER BY ca.cohort_week, ca.activity_week;This type of cohort analysis is used by product teams at every SaaS company.
Next Steps
- SQL Interview Prep (Medium): Practice the patterns you've learned here with 100 interview questions
- SQL Real-World Project: Build a complete e-commerce analytics database end-to-end
- SQL Interview Prep (Advanced): Window functions, recursive queries, optimization challenges