CTEs, Window Functions, and Analytics — SQL Mastery for Data Engineering | Learnixo

Why Advanced SQL Matters

Junior developers write queries that return the right results. Senior engineers write queries that return the right results and run in milliseconds on 100 million rows. This guide covers the techniques that separate the two.

We'll continue with the e-commerce database from the fundamentals guide.

Window Functions

Window functions perform calculations across a set of rows related to the current row — without collapsing them into a single group like GROUP BY does.

SQL

-- Syntax
function_name() OVER (
  PARTITION BY column  -- optional: separate window per group
  ORDER BY column      -- defines row ordering within the window
  ROWS/RANGE ...       -- optional: frame specification
)

ROW_NUMBER — Sequential numbering

SQL

-- Rank customers by their total spending (per city)
SELECT
  c.name,
  c.city,
  SUM(o.total_amount) AS total_spent,
  ROW_NUMBER() OVER (
    PARTITION BY c.city
    ORDER BY SUM(o.total_amount) DESC
  ) AS rank_in_city
FROM customers c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.city
ORDER BY c.city, rank_in_city;

Real-world use: "Top 3 customers in each city" report.

RANK and DENSE_RANK

SQL

-- RANK: ties get the same rank, next rank is skipped (1,1,3)
-- DENSE_RANK: ties get the same rank, no gaps (1,1,2)

SELECT
  name,
  total_amount,
  RANK()       OVER (ORDER BY total_amount DESC) AS rank,
  DENSE_RANK() OVER (ORDER BY total_amount DESC) AS dense_rank
FROM orders
ORDER BY total_amount DESC;

LAG and LEAD — Access adjacent rows

SQL

-- Month-over-month revenue growth
WITH monthly_revenue AS (
  SELECT
    DATE_TRUNC('month', order_date) AS month,
    SUM(total_amount) AS revenue
  FROM orders
  WHERE status = 'delivered'
  GROUP BY 1
)
SELECT
  month,
  revenue,
  LAG(revenue) OVER (ORDER BY month) AS prev_month,
  revenue - LAG(revenue) OVER (ORDER BY month) AS absolute_change,
  ROUND(
    100.0 * (revenue - LAG(revenue) OVER (ORDER BY month))
    / NULLIF(LAG(revenue) OVER (ORDER BY month), 0),
    1
  ) AS pct_change
FROM monthly_revenue
ORDER BY month;

Real-world use: Growth reporting in dashboards.

SUM / AVG as window functions — Running totals

SQL

-- Running total of revenue
SELECT
  order_date,
  total_amount,
  SUM(total_amount) OVER (ORDER BY order_date) AS running_total
FROM orders
WHERE status = 'delivered'
ORDER BY order_date;

-- 7-day moving average of daily revenue
WITH daily AS (
  SELECT order_date, SUM(total_amount) AS daily_revenue
  FROM orders
  WHERE status = 'delivered'
  GROUP BY order_date
)
SELECT
  order_date,
  daily_revenue,
  AVG(daily_revenue) OVER (
    ORDER BY order_date
    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
  ) AS moving_avg_7d
FROM daily;

NTILE — Divide into buckets

SQL

-- Segment customers into 4 spending quartiles
WITH customer_spend AS (
  SELECT
    customer_id,
    SUM(total_amount) AS total_spent
  FROM orders
  GROUP BY customer_id
)
SELECT
  c.name,
  cs.total_spent,
  NTILE(4) OVER (ORDER BY cs.total_spent DESC) AS quartile
FROM customer_spend cs
JOIN customers c ON cs.customer_id = c.customer_id;
-- Quartile 1 = top 25% spenders

FIRST_VALUE / LAST_VALUE

SQL

-- Show each order alongside the customer's first and latest order date
SELECT
  customer_id,
  order_id,
  order_date,
  FIRST_VALUE(order_date) OVER (
    PARTITION BY customer_id ORDER BY order_date
  ) AS first_order_date,
  LAST_VALUE(order_date) OVER (
    PARTITION BY customer_id
    ORDER BY order_date
    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
  ) AS last_order_date
FROM orders;

Advanced CTEs

Chained CTEs

SQL

-- Customer lifetime value analysis
WITH order_stats AS (
  SELECT
    customer_id,
    COUNT(*)          AS order_count,
    SUM(total_amount) AS lifetime_value,
    MIN(order_date)   AS first_order,
    MAX(order_date)   AS last_order
  FROM orders
  WHERE status = 'delivered'
  GROUP BY customer_id
),
customer_segments AS (
  SELECT
    customer_id,
    order_count,
    lifetime_value,
    first_order,
    last_order,
    CASE
      WHEN lifetime_value >= 5000 THEN 'VIP'
      WHEN lifetime_value >= 1000 THEN 'Loyal'
      WHEN order_count = 1        THEN 'One-time'
      ELSE 'Regular'
    END AS segment
  FROM order_stats
)
SELECT
  segment,
  COUNT(*)             AS customer_count,
  AVG(lifetime_value)  AS avg_ltv,
  AVG(order_count)     AS avg_orders
FROM customer_segments
GROUP BY segment
ORDER BY avg_ltv DESC;

Recursive CTEs — Hierarchies and sequences

SQL

-- Generate a date series (no table needed)
WITH RECURSIVE date_series AS (
  SELECT '2025-01-01'::DATE AS d
  UNION ALL
  SELECT d + INTERVAL '1 day'
  FROM date_series
  WHERE d < '2025-12-31'
)
SELECT d FROM date_series;

-- Employee hierarchy (org chart traversal)
CREATE TABLE employees (
  emp_id     INT PRIMARY KEY,
  name       VARCHAR(100),
  manager_id INT REFERENCES employees(emp_id)
);

WITH RECURSIVE org_chart AS (
  -- Anchor: CEO (no manager)
  SELECT emp_id, name, manager_id, 0 AS depth, name AS path
  FROM employees
  WHERE manager_id IS NULL

  UNION ALL

  -- Recursive: employees whose manager is in the previous level
  SELECT e.emp_id, e.name, e.manager_id, oc.depth + 1,
         oc.path || ' > ' || e.name
  FROM employees e
  JOIN org_chart oc ON e.manager_id = oc.emp_id
)
SELECT
  REPEAT('  ', depth) || name AS hierarchy,
  depth,
  path
FROM org_chart
ORDER BY path;

CASE Expressions

SQL

-- Categorize orders by size
SELECT
  order_id,
  total_amount,
  CASE
    WHEN total_amount >= 1000 THEN 'Large'
    WHEN total_amount >= 100  THEN 'Medium'
    ELSE 'Small'
  END AS order_size
FROM orders;

-- Pivot: count orders by status per month
SELECT
  DATE_TRUNC('month', order_date) AS month,
  COUNT(CASE WHEN status = 'delivered'  THEN 1 END) AS delivered,
  COUNT(CASE WHEN status = 'pending'    THEN 1 END) AS pending,
  COUNT(CASE WHEN status = 'cancelled'  THEN 1 END) AS cancelled
FROM orders
GROUP BY 1
ORDER BY 1;

Advanced Joins

Self Join

SQL

-- Find customers who ordered on the same day
SELECT
  a.customer_id AS customer1,
  b.customer_id AS customer2,
  a.order_date
FROM orders a
JOIN orders b
  ON a.order_date = b.order_date
  AND a.customer_id < b.customer_id  -- avoid duplicates
ORDER BY a.order_date;

CROSS JOIN — All combinations

SQL

-- Generate all product-customer combinations for a targeted campaign
SELECT c.name AS customer, p.name AS product
FROM customers c
CROSS JOIN products p
WHERE c.tier = 'pro'
  AND p.category = 'Electronics';

Lateral Join / APPLY

SQL

-- PostgreSQL: top 3 orders per customer using LATERAL
SELECT c.name, top_orders.order_id, top_orders.total_amount
FROM customers c
JOIN LATERAL (
  SELECT order_id, total_amount
  FROM orders
  WHERE customer_id = c.customer_id
  ORDER BY total_amount DESC
  LIMIT 3
) top_orders ON true;

Gaps and Islands

A classic pattern for finding consecutive sequences in time-series data.

SQL

-- Find date ranges where each product was in stock continuously
WITH inventory_log AS (
  SELECT product_id, date, stock > 0 AS in_stock
  FROM daily_inventory
),
grouped AS (
  SELECT *,
    ROW_NUMBER() OVER (PARTITION BY product_id ORDER BY date) -
    ROW_NUMBER() OVER (PARTITION BY product_id, in_stock ORDER BY date) AS grp
  FROM inventory_log
)
SELECT
  product_id,
  in_stock,
  MIN(date) AS period_start,
  MAX(date) AS period_end
FROM grouped
GROUP BY product_id, in_stock, grp
ORDER BY product_id, period_start;

Performance Tuning

EXPLAIN — Understanding execution plans

SQL

-- See how the database will execute a query
EXPLAIN SELECT * FROM orders WHERE customer_id = 42;

-- Get actual run times (execute the query)
EXPLAIN ANALYZE SELECT * FROM orders WHERE customer_id = 42;

What to look for:

Seq Scan on large tables = missing index
Hash Join vs Nested Loop — hash join is better for large datasets
Sort operations — can be eliminated with the right index
Rows estimate vs actual — large divergence = stale statistics

Index strategies

SQL

-- Partial index: only index active orders (much smaller, faster)
CREATE INDEX idx_active_orders ON orders(customer_id)
WHERE status NOT IN ('delivered', 'cancelled');

-- Covering index: include columns to avoid table lookup
CREATE INDEX idx_orders_covering ON orders(customer_id)
INCLUDE (total_amount, order_date, status);

-- Expression index
CREATE INDEX idx_lower_email ON customers(LOWER(email));
-- Enables: WHERE LOWER(email) = 'user@example.com'

-- Composite index: order matters
CREATE INDEX idx_orders_date_status ON orders(order_date, status);
-- Efficient for: WHERE order_date = X AND status = Y
-- Also efficient for: WHERE order_date = X
-- NOT efficient for: WHERE status = Y alone

Query optimization patterns

SQL

-- BAD: function on indexed column kills the index
SELECT * FROM orders WHERE YEAR(order_date) = 2025;

-- GOOD: use a range instead
SELECT * FROM orders
WHERE order_date >= '2025-01-01' AND order_date < '2026-01-01';

-- BAD: SELECT * pulls unused columns
SELECT * FROM orders WHERE customer_id = 42;

-- GOOD: only fetch what you need
SELECT order_id, order_date, total_amount
FROM orders WHERE customer_id = 42;

-- BAD: correlated subquery executes once per row
SELECT name,
  (SELECT COUNT(*) FROM orders WHERE customer_id = c.customer_id) AS cnt
FROM customers c;

-- GOOD: JOIN is evaluated once
SELECT c.name, COUNT(o.order_id) AS cnt
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name;

DISTINCT and Deduplication

SQL

-- Unique cities where we have customers
SELECT DISTINCT city FROM customers ORDER BY city;

-- Distinct on one column (keep first row per group) — PostgreSQL
SELECT DISTINCT ON (customer_id) customer_id, order_id, order_date
FROM orders
ORDER BY customer_id, order_date DESC;  -- keeps most recent order per customer

-- Remove duplicate rows (keep one)
WITH numbered AS (
  SELECT *, ROW_NUMBER() OVER (
    PARTITION BY email
    ORDER BY customer_id
  ) AS rn
  FROM customers
)
DELETE FROM customers
WHERE customer_id IN (
  SELECT customer_id FROM numbered WHERE rn > 1
);

String Aggregation

SQL

-- Comma-separated list of products per order
SELECT
  o.order_id,
  STRING_AGG(p.name, ', ' ORDER BY p.name) AS products
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p     ON oi.product_id = p.product_id
GROUP BY o.order_id;

-- MySQL equivalent
SELECT
  o.order_id,
  GROUP_CONCAT(p.name ORDER BY p.name SEPARATOR ', ') AS products
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p     ON oi.product_id = p.product_id
GROUP BY o.order_id;

JSON in SQL (PostgreSQL)

Modern databases support JSON columns for semi-structured data.

SQL

-- Store and query product metadata as JSON
ALTER TABLE products ADD COLUMN metadata JSONB;

UPDATE products SET metadata = '{"color": "red", "weight_kg": 1.5}'
WHERE product_id = 1;

-- Query by JSON field
SELECT name FROM products
WHERE metadata->>'color' = 'red';

-- Index a JSON field
CREATE INDEX idx_product_color ON products((metadata->>'color'));

-- Aggregate JSON data
SELECT
  metadata->>'color' AS color,
  COUNT(*) AS count,
  AVG(price) AS avg_price
FROM products
WHERE metadata IS NOT NULL
GROUP BY metadata->>'color';

Production-Ready Query Template

Here's a comprehensive reporting query that combines everything:

SQL

-- Weekly cohort retention analysis
WITH cohorts AS (
  SELECT
    customer_id,
    DATE_TRUNC('week', MIN(order_date)) AS cohort_week
  FROM orders
  GROUP BY customer_id
),
cohort_activity AS (
  SELECT
    c.cohort_week,
    DATE_TRUNC('week', o.order_date) AS activity_week,
    COUNT(DISTINCT o.customer_id) AS active_customers
  FROM orders o
  JOIN cohorts c ON o.customer_id = c.customer_id
  GROUP BY 1, 2
),
cohort_sizes AS (
  SELECT cohort_week, COUNT(*) AS cohort_size
  FROM cohorts
  GROUP BY 1
)
SELECT
  ca.cohort_week,
  cs.cohort_size,
  ca.activity_week,
  EXTRACT(WEEK FROM (ca.activity_week - ca.cohort_week)) AS week_number,
  ca.active_customers,
  ROUND(100.0 * ca.active_customers / cs.cohort_size, 1) AS retention_pct
FROM cohort_activity ca
JOIN cohort_sizes cs ON ca.cohort_week = cs.cohort_week
ORDER BY ca.cohort_week, ca.activity_week;

This type of cohort analysis is used by product teams at every SaaS company.

Next Steps

SQL Interview Prep (Medium): Practice the patterns you've learned here with 100 interview questions
SQL Real-World Project: Build a complete e-commerce analytics database end-to-end
SQL Interview Prep (Advanced): Window functions, recursive queries, optimization challenges