Data Engineering SQL Interview Questions (50 Hard Questions)
50 production-level SQL interview questions for data engineering roles — window functions, CTEs, performance, pipeline SQL, and Snowflake/BigQuery-specific syntax, each with a complete working answer.
How to Use This Guide
These questions are harder than typical FAANG SQL rounds. They test the SQL skills that matter in data engineering: pipeline logic, data modelling, and production query patterns. Each has a complete SQL answer you can run and study.
Part 1: Window Functions (10 Questions)
Q1: Compute running total of revenue, reset each month
SELECT
order_date,
order_id,
revenue,
SUM(revenue) OVER (
PARTITION BY DATE_TRUNC('month', order_date)
ORDER BY order_date, order_id
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS monthly_running_total
FROM orders
ORDER BY order_date, order_id;
Key point: with ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW the total advances one row at a time, while RANGE gives every ORDER BY peer the same total. Adding a unique tiebreaker (order_id here) makes the ROWS frame deterministic even when dates tie. Always specify the frame explicitly.
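A minimal illustration of the difference, using inline sample data (PostgreSQL VALUES syntax):
SELECT ts, amt,
  SUM(amt) OVER (ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rows_total,
  SUM(amt) OVER (ORDER BY ts RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS range_total
FROM (VALUES (DATE '2025-01-01', 10), (DATE '2025-01-01', 20), (DATE '2025-01-02', 5)) AS t(ts, amt);
-- rows_total: 10, 30, 35 (tie order is arbitrary); range_total: 30, 30, 35 (peers share a total)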
Q2: Rank customers by revenue within each region; show top 3 per region
WITH ranked AS (
SELECT
region,
customer_id,
SUM(revenue) AS total_revenue,
RANK() OVER (
PARTITION BY region
ORDER BY SUM(revenue) DESC
) AS revenue_rank
FROM orders
GROUP BY region, customer_id
)
SELECT region, customer_id, total_revenue, revenue_rank
FROM ranked
WHERE revenue_rank <= 3
ORDER BY region, revenue_rank;
RANK vs DENSE_RANK vs ROW_NUMBER: RANK skips numbers after ties (1,1,3). DENSE_RANK does not (1,1,2). ROW_NUMBER assigns unique numbers regardless of ties; only use ROW_NUMBER when you want exactly N rows and ties don't matter.
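A quick side-by-side over tied values (inline sample data, PostgreSQL syntax):
SELECT score,
  RANK() OVER (ORDER BY score DESC) AS rnk,          -- 1, 1, 3
  DENSE_RANK() OVER (ORDER BY score DESC) AS drnk,   -- 1, 1, 2
  ROW_NUMBER() OVER (ORDER BY score DESC) AS rn      -- 1, 2, 3
FROM (VALUES (100), (100), (90)) AS t(score);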
Q3: Detect gaps in a sequence of event IDs
WITH numbered AS (
SELECT
event_id,
LAG(event_id) OVER (ORDER BY event_id) AS prev_event_id
FROM events
)
SELECT
prev_event_id + 1 AS gap_start,
event_id - 1 AS gap_end,
event_id - prev_event_id - 1 AS gap_size
FROM numbered
WHERE event_id - prev_event_id > 1
ORDER BY gap_start;
Interview follow-up: "What if event_id isn't sequential but you want gaps in time?" Use LAG on a timestamp and look for DATEDIFF > expected_interval.
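A sketch of that time-based variant, flagging gaps longer than an assumed 60-minute expected interval (DATEDIFF argument order follows Snowflake, as in Q9 below):
WITH numbered AS (
  SELECT
    event_time,
    LAG(event_time) OVER (ORDER BY event_time) AS prev_event_time
  FROM events
)
SELECT
  prev_event_time AS gap_start,
  event_time AS gap_end,
  DATEDIFF('minute', prev_event_time, event_time) AS gap_minutes
FROM numbered
WHERE DATEDIFF('minute', prev_event_time, event_time) > 60;  -- expected_interval = 60 min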
Q4: Return the first and last purchase date per customer
SELECT DISTINCT
customer_id,
FIRST_VALUE(order_date) OVER (
PARTITION BY customer_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS first_purchase,
LAST_VALUE(order_date) OVER (
PARTITION BY customer_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS last_purchase
FROM orders;
Gotcha: LAST_VALUE without ROWS BETWEEN ... UNBOUNDED FOLLOWING gives you the current row's value, not the last in the partition, because the default frame ends at CURRENT ROW. Always specify the full frame for LAST_VALUE.
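For this particular question, a plain GROUP BY avoids both the frame gotcha and the DISTINCT entirely; the window version is worth knowing, but this is usually the simpler answer:
SELECT
  customer_id,
  MIN(order_date) AS first_purchase,
  MAX(order_date) AS last_purchase
FROM orders
GROUP BY customer_id;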
Q5: Calculate 7-day rolling average revenue (excluding weekends)
WITH daily_revenue AS (
SELECT
order_date,
SUM(revenue) AS daily_rev
FROM orders
WHERE DAYOFWEEK(order_date) NOT IN (1, 7) -- exclude Sun=1, Sat=7 (MySQL syntax)
GROUP BY order_date
)
SELECT
order_date,
daily_rev,
AVG(daily_rev) OVER (
ORDER BY order_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS rolling_7d_avg
FROM daily_revenue
ORDER BY order_date;
Note: because weekend rows are filtered out before the window runs, ROWS BETWEEN 6 PRECEDING AND CURRENT ROW averages the last 7 business days, not a calendar week.
Q6: Find the customer's nth purchase (parameterised)
-- Find each customer's 3rd purchase
WITH ranked_purchases AS (
SELECT
customer_id,
order_id,
order_date,
ROW_NUMBER() OVER (
PARTITION BY customer_id
ORDER BY order_date, order_id
) AS purchase_num
FROM orders
)
SELECT customer_id, order_id, order_date
FROM ranked_purchases
WHERE purchase_num = 3;
Q7: Calculate percentile of each order's value within its product category
SELECT
order_id,
product_category,
order_value,
PERCENT_RANK() OVER (
PARTITION BY product_category
ORDER BY order_value
) AS pct_rank,
NTILE(100) OVER (
PARTITION BY product_category
ORDER BY order_value
) AS percentile_bucket
FROM orders;
PERCENT_RANK vs NTILE: PERCENT_RANK gives exact relative position (0 to 1). NTILE(100) assigns rows to 100 buckets; simpler for "top 10%" queries but less precise.
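For example, a "top 10% of orders by value within each category" filter using the NTILE approach:
WITH bucketed AS (
  SELECT
    order_id,
    product_category,
    order_value,
    NTILE(100) OVER (PARTITION BY product_category ORDER BY order_value) AS pctile
  FROM orders
)
SELECT order_id, product_category, order_value
FROM bucketed
WHERE pctile > 90;  -- buckets 91-100 = top 10% by value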
Q8: Compute month-over-month revenue growth rate
WITH monthly AS (
SELECT
DATE_TRUNC('month', order_date) AS month,
SUM(revenue) AS monthly_revenue
FROM orders
GROUP BY 1
)
SELECT
month,
monthly_revenue,
LAG(monthly_revenue) OVER (ORDER BY month) AS prev_month_revenue,
ROUND(
(monthly_revenue - LAG(monthly_revenue) OVER (ORDER BY month))
/ NULLIF(LAG(monthly_revenue) OVER (ORDER BY month), 0) * 100,
2
) AS mom_growth_pct
FROM monthly
ORDER BY month;
Q9: Identify sessions from clickstream events (30-min inactivity = new session)
WITH with_prev AS (
SELECT
user_id,
event_time,
LAG(event_time) OVER (PARTITION BY user_id ORDER BY event_time) AS prev_event_time
FROM clickstream
),
with_session_flag AS (
SELECT
user_id,
event_time,
CASE
WHEN prev_event_time IS NULL
OR DATEDIFF('minute', prev_event_time, event_time) > 30
THEN 1
ELSE 0
END AS new_session
FROM with_prev
)
SELECT
user_id,
event_time,
SUM(new_session) OVER (
PARTITION BY user_id
ORDER BY event_time
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS session_id
FROM with_session_flag;
Q10: Find customers whose last order was their highest-value order
WITH ranked AS (
SELECT
customer_id,
order_id,
order_value,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) AS recency_rank,
RANK() OVER (PARTITION BY customer_id ORDER BY order_value DESC) AS value_rank
FROM orders
)
SELECT customer_id, order_id, order_value
FROM ranked
WHERE recency_rank = 1 -- most recent order
AND value_rank = 1; -- also their highest-value order
Part 2: CTEs and Recursion (10 Questions)
Q11: Recursive CTE — traverse an org chart to find all reports under a manager
WITH RECURSIVE org_tree AS (
-- Anchor: start with the target manager
SELECT employee_id, manager_id, name, 0 AS depth
FROM employees
WHERE employee_id = 1001 -- root manager
UNION ALL
-- Recursive: join to find direct reports
SELECT e.employee_id, e.manager_id, e.name, ot.depth + 1
FROM employees e
INNER JOIN org_tree ot ON e.manager_id = ot.employee_id
)
SELECT employee_id, name, depth
FROM org_tree
ORDER BY depth, name;
Q12: De-duplicate rows keeping the most recent record per key
WITH ranked AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY customer_id
ORDER BY updated_at DESC
) AS rn
FROM customers_raw
)
SELECT * EXCLUDE (rn)
FROM ranked
WHERE rn = 1;
Note: EXCLUDE syntax is Snowflake/DuckDB. In standard SQL, list the columns explicitly: SELECT col1, col2, ... FROM ranked WHERE rn = 1.
Q13: Multi-step CTE transformation — calculate customer lifetime value tiers
WITH
order_totals AS (
SELECT customer_id, SUM(revenue) AS total_revenue, COUNT(*) AS order_count
FROM orders
GROUP BY customer_id
),
with_metrics AS (
SELECT
customer_id,
total_revenue,
order_count,
total_revenue / NULLIF(order_count, 0) AS avg_order_value
FROM order_totals
),
with_tier AS (
SELECT
*,
CASE
WHEN total_revenue >= 10000 THEN 'platinum'
WHEN total_revenue >= 2000 THEN 'gold'
WHEN total_revenue >= 500 THEN 'silver'
ELSE 'bronze'
END AS ltv_tier
FROM with_metrics
)
SELECT ltv_tier, COUNT(*) AS customer_count, AVG(total_revenue) AS avg_ltv
FROM with_tier
GROUP BY ltv_tier
ORDER BY avg_ltv DESC;
Q14: Recursive CTE — generate a date spine for a date range
WITH RECURSIVE date_spine AS (
SELECT DATE '2025-01-01' AS dt
UNION ALL
SELECT dt + INTERVAL '1 day'
FROM date_spine
WHERE dt < DATE '2025-12-31'
)
-- Left join to your data to fill gaps
SELECT
ds.dt,
COALESCE(o.daily_revenue, 0) AS revenue
FROM date_spine ds
LEFT JOIN (
SELECT DATE(order_date) AS dt, SUM(revenue) AS daily_revenue
FROM orders
GROUP BY 1
) o ON ds.dt = o.dt
ORDER BY ds.dt;
Why date spines matter in DE: Without a date spine, days with zero events are missing from your aggregations. Always left-join against a date spine for time-based reports.
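Many engines also provide a set-generating function that avoids recursion entirely; for example, PostgreSQL's generate_series (Snowflake's rough equivalent is the GENERATOR table function):
-- PostgreSQL: build the same 2025 date spine without a recursive CTE
SELECT generate_series(DATE '2025-01-01', DATE '2025-12-31', INTERVAL '1 day')::date AS dt;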
Q15: Find the top product per category using a CTE + window function
WITH category_revenue AS (
SELECT
product_category,
product_id,
SUM(revenue) AS total_revenue,
RANK() OVER (
PARTITION BY product_category
ORDER BY SUM(revenue) DESC
) AS cat_rank
FROM order_items oi
JOIN products p USING (product_id)
GROUP BY product_category, product_id
)
SELECT product_category, product_id, total_revenue
FROM category_revenue
WHERE cat_rank = 1;
Part 3: Performance (10 Questions)
Q16: What does this execution plan tell you, and how do you fix it?
Question: You run EXPLAIN on a query and see Seq Scan on orders (cost=0.00..45000.00). What does this mean?
Answer: A sequential scan means PostgreSQL is reading every row in the orders table. This is fine for small tables or when you're selecting most rows, but expensive for selective queries. Fix:
-- Before: missing index
SELECT * FROM orders WHERE customer_id = 12345;
-- Fix: create a covering index
CREATE INDEX CONCURRENTLY idx_orders_customer_id ON orders(customer_id);
-- Better: if you frequently query by customer + date
CREATE INDEX CONCURRENTLY idx_orders_customer_date
ON orders(customer_id, order_date DESC)
INCLUDE (order_id, revenue);
The INCLUDE clause (PostgreSQL 11+) adds non-key columns to the index leaf pages, enabling index-only scans for common queries.
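You can confirm the index-only scan with EXPLAIN (the plan lines to look for are illustrative):
EXPLAIN (ANALYZE, BUFFERS)
SELECT order_id, revenue
FROM orders
WHERE customer_id = 12345
ORDER BY order_date DESC;
-- Look for "Index Only Scan using idx_orders_customer_date" and "Heap Fetches: 0"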
Q17: Why is this query slow, and how do you fix it?
-- Slow: function on indexed column prevents index use
SELECT * FROM orders
WHERE YEAR(order_date) = 2025
AND MONTH(order_date) = 3;
-- Fast: use range predicate that allows index scan
SELECT * FROM orders
WHERE order_date >= '2025-03-01'
AND order_date < '2025-04-01';
Rule: Never wrap an indexed column in a function in the WHERE clause. The optimizer cannot use the index because it would need to evaluate the function for every row.
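If you genuinely must filter on an expression, PostgreSQL lets you index the expression itself (a sketch, assuming order_date is a DATE column):
CREATE INDEX idx_orders_order_month ON orders (DATE_TRUNC('month', order_date));
-- This predicate can now use the index because it matches the indexed expression:
SELECT * FROM orders WHERE DATE_TRUNC('month', order_date) = DATE '2025-03-01';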
Q18: Explain the difference between clustered and non-clustered indexes
Answer: A clustered index determines the physical order of rows on disk. There can be only one per table (usually the primary key). Data rows are stored in the index's leaf pages.
A non-clustered index is a separate structure with pointers back to the heap. Multiple per table are allowed.
For data engineers: In columnar stores (Snowflake, BigQuery, Redshift), the equivalent concept is clustering keys or sort keys. In Snowflake:
-- Create a table clustered on frequently-filtered columns
CREATE TABLE orders (
order_id INT,
order_date DATE,
customer_id INT,
region VARCHAR,
revenue DECIMAL(10,2)
) CLUSTER BY (order_date, region);
Clustering lets Snowflake prune more micro-partitions at query time, so a smaller percentage of the table is scanned, which directly reduces compute cost.
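Snowflake exposes a system function to check how well a table is clustered on a given key:
SELECT SYSTEM$CLUSTERING_INFORMATION('orders', '(order_date, region)');
-- Returns JSON including average clustering depth; lower depth means better pruning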
Q19: How would you optimise a query joining a 100M row table to a 500-row lookup table?
-- Baseline: a plain join (engines typically hash-join and broadcast the small side)
SELECT o.*, s.status_label
FROM orders o
JOIN status_lookup s ON o.status_code = s.status_code;
-- Optimisation 1: if the lookup is truly small, ensure it's broadcast
-- (Spark/BigQuery/Snowflake handle this automatically for small tables)
-- Optimisation 2: if used repeatedly, materialise status labels
-- with a generated column or denormalise at load time
ALTER TABLE orders ADD COLUMN status_label VARCHAR;
UPDATE orders o
SET status_label = s.status_label
FROM status_lookup s
WHERE o.status_code = s.status_code;
-- Then no join needed at query time
Q20: Write a query that's optimised for Snowflake's micro-partition pruning
-- Bad: filters only on a non-cluster-key column, so every micro-partition is scanned
SELECT * FROM orders WHERE customer_id = 12345;
-- Good: filter on cluster key first (order_date), then narrow down
-- Assumes table is clustered on order_date
SELECT * FROM orders
WHERE order_date BETWEEN '2025-01-01' AND '2025-03-31'
AND customer_id = 12345;
Snowflake automatically prunes micro-partitions where the cluster key range doesn't overlap with your filter. Putting the cluster key filter first in the WHERE clause is a best practice for plan readability (though the optimizer handles order).
Part 4: Data Pipeline SQL (10 Questions)
Q21: Write an incremental load query using a high-watermark pattern
-- Find the high-watermark (last loaded timestamp)
WITH watermark AS (
SELECT COALESCE(MAX(last_loaded), '1970-01-01'::TIMESTAMP) AS last_loaded
FROM pipeline_watermarks
WHERE table_name = 'orders'
)
-- Load only new/changed rows
INSERT INTO orders_warehouse
SELECT s.*
FROM orders_source s
CROSS JOIN watermark w
WHERE s.updated_at > w.last_loaded;
-- Update the watermark
UPDATE pipeline_watermarks
SET last_loaded = CURRENT_TIMESTAMP
WHERE table_name = 'orders';
Caution: setting the watermark to CURRENT_TIMESTAMP can silently skip rows that commit with an earlier updated_at while the load runs; it is safer to record the MAX(updated_at) of the batch you actually loaded.
Q22: Implement SCD Type 2 MERGE for customer dimension
-- A MERGE fires at most one action per source row, so it cannot both close the
-- old version AND insert the new one. Use two steps.
-- Step 1: close the current record when a tracked attribute changes
MERGE INTO dim_customer AS target
USING customer_staging AS source
ON target.customer_id = source.customer_id AND target.is_current = TRUE
WHEN MATCHED AND (
target.email <> source.email OR
target.address <> source.address OR
target.tier <> source.tier
) THEN UPDATE SET
target.is_current = FALSE,
target.valid_to = CURRENT_DATE - INTERVAL '1 day';
-- Step 2: insert a new current version for changed and brand-new customers
INSERT INTO dim_customer (
customer_id, email, address, tier,
valid_from, valid_to, is_current
)
SELECT
s.customer_id, s.email, s.address, s.tier,
CURRENT_DATE, DATE '9999-12-31', TRUE
FROM customer_staging s
LEFT JOIN dim_customer d
ON d.customer_id = s.customer_id AND d.is_current = TRUE
WHERE d.customer_id IS NULL; -- no open row: either brand-new, or just closed in step 1
Q23: Deduplicate a CDC stream using ROW_NUMBER
-- CDC stream may have multiple versions of the same row
WITH deduped AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY business_key
ORDER BY change_timestamp DESC, operation DESC
-- operation DESC: 'U' > 'I' if same timestamp (prefer updates)
) AS rn
FROM cdc_raw_stream
WHERE operation != 'D' -- exclude deletes for this example
)
SELECT * EXCLUDE (rn)
FROM deduped
WHERE rn = 1;
Q24: Handle late-arriving data in a fact table
-- Option 1: MERGE to handle both new and late-arriving records
MERGE INTO fact_orders AS target
USING (
SELECT order_id, revenue, status, event_timestamp
FROM orders_staging
) AS source
ON target.order_id = source.order_id
WHEN MATCHED AND source.event_timestamp > target.event_timestamp
THEN UPDATE SET
target.revenue = source.revenue,
target.status = source.status,
target.event_timestamp = source.event_timestamp,
target.updated_at = CURRENT_TIMESTAMP
WHEN NOT MATCHED
THEN INSERT (order_id, revenue, status, event_timestamp, inserted_at)
VALUES (source.order_id, source.revenue, source.status,
source.event_timestamp, CURRENT_TIMESTAMP);
Q25: Find duplicate records across multiple columns efficiently
-- Find exact duplicates (all columns match)
WITH dup_check AS (
SELECT
customer_id,
order_date,
product_id,
COUNT(*) AS dup_count
FROM order_items
GROUP BY customer_id, order_date, product_id
HAVING COUNT(*) > 1
)
SELECT oi.*, dc.dup_count
FROM order_items oi
JOIN dup_check dc
ON oi.customer_id = dc.customer_id
AND oi.order_date = dc.order_date
AND oi.product_id = dc.product_id
ORDER BY oi.customer_id, oi.order_date;
Part 5: Snowflake and BigQuery Specific (10 Questions)
Q26: Query a JSON column in Snowflake using VARIANT
-- Snowflake VARIANT column for semi-structured data
SELECT
event_id,
event_data:user_id::VARCHAR AS user_id,
event_data:properties:page::VARCHAR AS page,
event_data:properties:duration::INT AS duration_seconds,
event_data:tags[0]::VARCHAR AS first_tag
FROM events
WHERE event_data:event_type::VARCHAR = 'page_view'
AND event_data:properties:duration::INT > 30;
Q27: Use FLATTEN to unnest an array in Snowflake
-- Unnest a JSON array of items within each order
SELECT
o.order_id,
item.value:product_id::VARCHAR AS product_id,
item.value:quantity::INT AS quantity,
item.value:price::DECIMAL(10,2) AS unit_price
FROM orders o,
LATERAL FLATTEN(input => o.order_data:items) AS item;
Equivalent in BigQuery using UNNEST:
SELECT
o.order_id,
JSON_VALUE(item, '$.product_id') AS product_id,
CAST(JSON_VALUE(item, '$.quantity') AS INT64) AS quantity,
CAST(JSON_VALUE(item, '$.price') AS NUMERIC) AS unit_price
FROM orders o,
UNNEST(JSON_QUERY_ARRAY(o.order_data, '$.items')) AS item;
Q28: Use QUALIFY to filter window function results (Snowflake)
-- QUALIFY filters on window function results; originally Teradata, now also in Snowflake, BigQuery, and DuckDB
-- Without QUALIFY (standard SQL):
WITH ranked AS (
SELECT *, ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) AS rn
FROM orders
)
SELECT * FROM ranked WHERE rn = 1;
-- With QUALIFY (Snowflake, BigQuery, DuckDB):
SELECT *
FROM orders
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) = 1;
Q29: Use Snowflake Time Travel to compare current and historical data
-- Query data as it was 2 days ago
SELECT * FROM orders AT (OFFSET => -2 * 24 * 60 * 60);
-- Compare today vs 7 days ago to detect data drift
SELECT
'current' AS snapshot,
COUNT(*) AS row_count,
AVG(revenue) AS avg_revenue
FROM orders
UNION ALL
SELECT
'7_days_ago' AS snapshot,
COUNT(*),
AVG(revenue)
FROM orders AT (OFFSET => -7 * 24 * 60 * 60);
Note: this only works inside the table's Time Travel retention window (1 day by default; configurable up to 90 days on Enterprise edition).
Q30: BigQuery partitioned table query with partition pruning
-- BigQuery: always filter on partition column for cost control
-- Bad: full table scan on a 10TB table
SELECT * FROM `project.dataset.events` WHERE user_id = 12345;
-- Good: partition prune first (event_date is the partition column)
SELECT * FROM `project.dataset.events`
WHERE event_date BETWEEN '2025-03-01' AND '2025-03-31'
AND user_id = 12345;
-- Check bytes processed before running (dry run):
-- bq query --dry_run --use_legacy_sql=false 'SELECT ...'
Questions 31-50: Additional Topics
Q31-35: Complex Pipeline SQL Patterns
-- Q31: Approximate daily distinct users (HyperLogLog-backed APPROX_COUNT_DISTINCT in Snowflake)
SELECT
DATE_TRUNC('day', event_time) AS day,
APPROX_COUNT_DISTINCT(user_id) AS approx_dau
FROM events
GROUP BY 1
ORDER BY 1;
-- Q32: Pivot revenue by quarter (conditional aggregation)
SELECT
product_id,
SUM(CASE WHEN quarter = 1 THEN revenue ELSE 0 END) AS q1,
SUM(CASE WHEN quarter = 2 THEN revenue ELSE 0 END) AS q2,
SUM(CASE WHEN quarter = 3 THEN revenue ELSE 0 END) AS q3,
SUM(CASE WHEN quarter = 4 THEN revenue ELSE 0 END) AS q4
FROM (
SELECT product_id, revenue, QUARTER(order_date) AS quarter
FROM orders
WHERE YEAR(order_date) = 2025
) sub
GROUP BY product_id;
-- Q33: Funnel analysis with conditional aggregation
WITH funnel AS (
SELECT
user_id,
MAX(CASE WHEN event_type = 'view' THEN 1 ELSE 0 END) AS viewed,
MAX(CASE WHEN event_type = 'add_cart' THEN 1 ELSE 0 END) AS added_cart,
MAX(CASE WHEN event_type = 'checkout' THEN 1 ELSE 0 END) AS checked_out,
MAX(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS purchased
FROM events
WHERE session_date = '2025-05-01'
GROUP BY user_id
)
SELECT
SUM(viewed) AS step1_view,
SUM(added_cart) AS step2_cart,
SUM(checked_out) AS step3_checkout,
SUM(purchased) AS step4_purchase,
ROUND(SUM(purchased) * 100.0 / NULLIF(SUM(viewed), 0), 2) AS view_to_purchase_pct
FROM funnel;
-- Q34: Find customers who bought product A but never product B
SELECT DISTINCT o1.customer_id
FROM order_items o1
WHERE o1.product_id = 'PROD_A'
AND o1.customer_id NOT IN (
SELECT customer_id FROM order_items WHERE product_id = 'PROD_B'
);
-- Better with NOT EXISTS (NOT IN returns zero rows if the subquery yields any NULL customer_id):
SELECT DISTINCT customer_id
FROM order_items oi1
WHERE product_id = 'PROD_A'
AND NOT EXISTS (
SELECT 1 FROM order_items oi2
WHERE oi2.customer_id = oi1.customer_id
AND oi2.product_id = 'PROD_B'
);
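-- A third option (a sketch): set difference with EXCEPT; equivalent to the
-- NOT EXISTS version here as long as customer_id is never NULL
SELECT customer_id FROM order_items WHERE product_id = 'PROD_A'
EXCEPT
SELECT customer_id FROM order_items WHERE product_id = 'PROD_B';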
-- Q35: Calculate cohort retention (week 0, week 1, week 2...)
WITH cohorts AS (
SELECT
user_id,
DATE_TRUNC('week', MIN(order_date)) AS cohort_week
FROM orders
GROUP BY user_id
),
activity AS (
SELECT
o.user_id,
c.cohort_week,
DATEDIFF('week', c.cohort_week, DATE_TRUNC('week', o.order_date)) AS weeks_since_first
FROM orders o
JOIN cohorts c USING (user_id)
)
SELECT
cohort_week,
weeks_since_first,
COUNT(DISTINCT user_id) AS active_users
FROM activity
GROUP BY 1, 2
ORDER BY 1, 2;
Interview Tips: What Interviewers Look For
In window function questions: Do you know when to use ROWS vs RANGE? Do you specify frame clauses explicitly?
In CTE questions: Can you break a multi-step problem into readable, named intermediate results? Do you know when a CTE is better than a subquery (readability, reuse) and when it isn't (CTEs are materialised in some engines, which can hurt performance)?
In performance questions: Do you think about the query plan, not just correctness? Do you mention indexes, partitioning, and clustering?
In pipeline SQL questions: Do you know how to make operations idempotent? Can you handle late data without breaking downstream consumers?
In Snowflake/BigQuery questions: Do you know the cost model? Snowflake bills for warehouse compute time; BigQuery's on-demand pricing bills by bytes scanned. Every optimisation decision flows from knowing that.