SQL & Database Mastery · Lesson 3 of 6
Project: E-Commerce Analytics
Project Overview
We're building the analytics backend for ShopFlow — a mid-sized e-commerce company with 50,000+ customers, 10,000+ products, and 500,000+ orders.
The analytics team needs:
- A well-designed schema that supports complex reporting
- Real-time KPI dashboards (revenue, conversion, retention)
- Customer segmentation for marketing campaigns
- Inventory and product performance reports
- Fraud detection queries
This is exactly the kind of work a data engineer or senior backend developer does in their first month at a product company.
Phase 1: Schema Design
Good schema design makes queries fast and maintainable. Bad design leads to unmaintainable queries and slow reports.
-- ============================================================
-- CORE TABLES
-- ============================================================
CREATE TABLE customers (
customer_id SERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE NOT NULL,
first_name VARCHAR(100),
last_name VARCHAR(100),
phone VARCHAR(20),
date_of_birth DATE,
gender CHAR(1), -- 'M', 'F', 'O'
signup_date TIMESTAMP NOT NULL DEFAULT NOW(),
last_login TIMESTAMP,
is_active BOOLEAN NOT NULL DEFAULT TRUE,
tier VARCHAR(20) NOT NULL DEFAULT 'free'
CHECK (tier IN ('free', 'pro', 'enterprise'))
);
CREATE TABLE addresses (
address_id SERIAL PRIMARY KEY,
customer_id INT NOT NULL REFERENCES customers(customer_id),
type VARCHAR(20) NOT NULL, -- 'billing', 'shipping'
street VARCHAR(255),
city VARCHAR(100),
state VARCHAR(100),
country CHAR(2) NOT NULL, -- ISO country code
postcode VARCHAR(20),
is_default BOOLEAN DEFAULT FALSE
);
CREATE TABLE categories (
category_id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
parent_id INT REFERENCES categories(category_id), -- hierarchical
slug VARCHAR(100) UNIQUE
);
CREATE TABLE products (
product_id SERIAL PRIMARY KEY,
sku VARCHAR(100) UNIQUE NOT NULL,
name VARCHAR(255) NOT NULL,
description TEXT,
category_id INT REFERENCES categories(category_id),
brand VARCHAR(100),
cost_price DECIMAL(10,2), -- what we paid for it
list_price DECIMAL(10,2) NOT NULL, -- displayed price
weight_kg DECIMAL(8,3),
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMP DEFAULT NOW()
);
CREATE TABLE inventory (
inventory_id SERIAL PRIMARY KEY,
product_id INT NOT NULL REFERENCES products(product_id),
warehouse VARCHAR(50),
quantity INT NOT NULL DEFAULT 0,
reorder_level INT DEFAULT 10,
last_updated TIMESTAMP DEFAULT NOW()
);
CREATE TABLE orders (
order_id SERIAL PRIMARY KEY,
customer_id INT NOT NULL REFERENCES customers(customer_id),
order_number VARCHAR(50) UNIQUE NOT NULL, -- human-readable e.g. ORD-2025-001234
status VARCHAR(20) NOT NULL DEFAULT 'pending'
CHECK (status IN ('pending','processing','shipped','delivered','cancelled','refunded')),
payment_method VARCHAR(50),
shipping_address_id INT REFERENCES addresses(address_id),
subtotal DECIMAL(10,2) NOT NULL,
discount_amount DECIMAL(10,2) DEFAULT 0,
shipping_cost DECIMAL(10,2) DEFAULT 0,
tax_amount DECIMAL(10,2) DEFAULT 0,
total_amount DECIMAL(10,2) NOT NULL,
currency CHAR(3) DEFAULT 'GBP',
notes TEXT,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP DEFAULT NOW()
);
CREATE TABLE order_items (
item_id SERIAL PRIMARY KEY,
order_id INT NOT NULL REFERENCES orders(order_id),
product_id INT NOT NULL REFERENCES products(product_id),
quantity INT NOT NULL,
unit_price DECIMAL(10,2) NOT NULL, -- price at time of purchase
discount_pct DECIMAL(5,2) DEFAULT 0, -- percentage discount on this item
line_total DECIMAL(10,2) NOT NULL
);
CREATE TABLE coupons (
coupon_id SERIAL PRIMARY KEY,
code VARCHAR(50) UNIQUE NOT NULL,
discount_type VARCHAR(20) NOT NULL, -- 'percentage', 'fixed'
discount_value DECIMAL(10,2) NOT NULL,
min_order_value DECIMAL(10,2) DEFAULT 0,
max_uses INT,
times_used INT DEFAULT 0,
valid_from DATE,
valid_until DATE,
is_active BOOLEAN DEFAULT TRUE
);
CREATE TABLE order_coupons (
order_id INT NOT NULL REFERENCES orders(order_id),
coupon_id INT NOT NULL REFERENCES coupons(coupon_id),
discount_applied DECIMAL(10,2),
PRIMARY KEY (order_id, coupon_id)
);
-- ============================================================
-- EVENT / ANALYTICS TABLES
-- ============================================================
CREATE TABLE page_views (
view_id BIGSERIAL PRIMARY KEY,
session_id VARCHAR(100),
customer_id INT REFERENCES customers(customer_id), -- NULL if anonymous
page_url TEXT,
product_id INT REFERENCES products(product_id), -- NULL if not a product page
viewed_at TIMESTAMP NOT NULL DEFAULT NOW(),
time_on_page INT, -- seconds
referrer TEXT,
device_type VARCHAR(20) -- 'desktop', 'mobile', 'tablet'
);
CREATE TABLE cart_events (
event_id BIGSERIAL PRIMARY KEY,
session_id VARCHAR(100),
customer_id INT REFERENCES customers(customer_id),
product_id INT NOT NULL REFERENCES products(product_id),
action VARCHAR(20) NOT NULL, -- 'add', 'remove', 'view_cart'
quantity INT,
event_at TIMESTAMP NOT NULL DEFAULT NOW()
);Indexes for performance
-- Orders: most common query patterns
CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_orders_status ON orders(status);
CREATE INDEX idx_orders_created ON orders(created_at);
CREATE INDEX idx_orders_status_date ON orders(status, created_at);
-- Order items: JOIN performance
CREATE INDEX idx_items_order ON order_items(order_id);
CREATE INDEX idx_items_product ON order_items(product_id);
-- Page views: analytics queries
CREATE INDEX idx_pageviews_customer ON page_views(customer_id);
CREATE INDEX idx_pageviews_product ON page_views(product_id);
CREATE INDEX idx_pageviews_time ON page_views(viewed_at);
-- Cart events
CREATE INDEX idx_cart_customer ON cart_events(customer_id);
CREATE INDEX idx_cart_product ON cart_events(product_id);Phase 2: Sample Data Setup
-- Insert sample categories (hierarchical)
INSERT INTO categories (name, parent_id, slug) VALUES
('Electronics', NULL, 'electronics'),
('Clothing', NULL, 'clothing'),
('Books', NULL, 'books'),
('Laptops', 1, 'laptops'),
('Phones', 1, 'phones'),
('T-Shirts', 2, 'tshirts'),
('Jackets', 2, 'jackets');
-- Generate 1000 test customers (PostgreSQL)
INSERT INTO customers (email, first_name, last_name, signup_date, tier)
SELECT
'user' || i || '@example.com',
(ARRAY['Alice','Bob','Carol','David','Eve','Frank'])[1 + (i % 6)],
(ARRAY['Smith','Jones','Patel','Kim','Müller','García'])[1 + (i % 6)],
NOW() - (RANDOM() * INTERVAL '730 days'),
(ARRAY['free','free','free','pro','pro','enterprise'])[1 + (i % 6)]
FROM generate_series(1, 1000) AS i;Phase 3: KPI Dashboard Queries
These are the queries that power the analytics dashboard on the home page.
Revenue summary
CREATE OR REPLACE VIEW v_revenue_summary AS
WITH periods AS (
SELECT
SUM(CASE WHEN created_at >= CURRENT_DATE THEN total_amount END) AS today,
SUM(CASE WHEN created_at >= CURRENT_DATE - 7 THEN total_amount END) AS last_7_days,
SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE) THEN total_amount END) AS mtd,
SUM(CASE WHEN created_at >= DATE_TRUNC('year', CURRENT_DATE) THEN total_amount END) AS ytd,
SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE)
AND created_at < DATE_TRUNC('month', CURRENT_DATE) + INTERVAL '1 month'
THEN total_amount END) AS this_month,
SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month'
AND created_at < DATE_TRUNC('month', CURRENT_DATE) THEN total_amount END) AS last_month
FROM orders
WHERE status NOT IN ('cancelled', 'refunded')
)
SELECT
today,
last_7_days,
mtd,
ytd,
this_month,
last_month,
ROUND(100.0 * (this_month - last_month) / NULLIF(last_month, 0), 1) AS mom_growth_pct
FROM periods;Daily revenue chart (last 30 days)
WITH date_spine AS (
SELECT generate_series(
CURRENT_DATE - 29,
CURRENT_DATE,
'1 day'::INTERVAL
)::DATE AS d
),
daily_orders AS (
SELECT
created_at::DATE AS order_date,
COUNT(*) AS order_count,
SUM(total_amount) AS revenue
FROM orders
WHERE status NOT IN ('cancelled', 'refunded')
AND created_at >= CURRENT_DATE - 29
GROUP BY 1
)
SELECT
ds.d AS date,
COALESCE(do.order_count, 0) AS orders,
COALESCE(do.revenue, 0) AS revenue
FROM date_spine ds
LEFT JOIN daily_orders do ON ds.d = do.order_date
ORDER BY ds.d;Conversion funnel
WITH funnel AS (
SELECT
'Sessions' AS stage, COUNT(DISTINCT session_id) AS users FROM page_views
UNION ALL
SELECT
'Product Views', COUNT(DISTINCT session_id)
FROM page_views WHERE product_id IS NOT NULL
UNION ALL
SELECT
'Add to Cart', COUNT(DISTINCT session_id)
FROM cart_events WHERE action = 'add'
UNION ALL
SELECT
'Checkout', COUNT(DISTINCT o.customer_id)
FROM orders o WHERE created_at >= CURRENT_DATE - 30
),
ordered AS (
SELECT *, ROW_NUMBER() OVER () AS step FROM funnel
)
SELECT
step,
stage,
users,
ROUND(100.0 * users / FIRST_VALUE(users) OVER (ORDER BY step), 1) AS pct_of_top
FROM ordered;Phase 4: Customer Segmentation (RFM Analysis)
RFM = Recency (when did they last buy?), Frequency (how often?), Monetary (how much did they spend?).
WITH rfm_base AS (
SELECT
customer_id,
CURRENT_DATE - MAX(created_at)::DATE AS recency_days,
COUNT(DISTINCT order_id) AS frequency,
SUM(total_amount) AS monetary
FROM orders
WHERE status NOT IN ('cancelled', 'refunded')
GROUP BY customer_id
),
rfm_scores AS (
SELECT
customer_id,
recency_days,
frequency,
monetary,
NTILE(5) OVER (ORDER BY recency_days ASC) AS r_score, -- lower days = higher score
NTILE(5) OVER (ORDER BY frequency ASC) AS f_score,
NTILE(5) OVER (ORDER BY monetary ASC) AS m_score
FROM rfm_base
),
rfm_segments AS (
SELECT *,
CASE
WHEN r_score >= 4 AND f_score >= 4 AND m_score >= 4 THEN 'Champions'
WHEN r_score >= 4 AND f_score >= 2 THEN 'Loyal Customers'
WHEN r_score >= 4 AND f_score = 1 THEN 'Recent Customers'
WHEN r_score = 3 AND f_score >= 3 THEN 'Potential Loyalists'
WHEN r_score = 2 AND f_score >= 4 THEN 'At Risk'
WHEN r_score <= 2 AND f_score <= 2 THEN 'Hibernating'
ELSE 'Needs Attention'
END AS segment
FROM rfm_scores
)
SELECT
segment,
COUNT(*) AS customers,
ROUND(AVG(monetary)) AS avg_spend,
ROUND(AVG(frequency), 1) AS avg_orders,
ROUND(AVG(recency_days)) AS avg_days_since_order
FROM rfm_segments
GROUP BY segment
ORDER BY avg_spend DESC;Phase 5: Product Analytics
-- Top products by revenue with category breakdown
WITH product_revenue AS (
SELECT
p.product_id,
p.name AS product_name,
c.name AS category,
p.list_price,
SUM(oi.quantity) AS units_sold,
SUM(oi.line_total) AS total_revenue,
COUNT(DISTINCT oi.order_id) AS orders_containing,
-- Gross margin
ROUND(100.0 * (p.list_price - COALESCE(p.cost_price, 0)) / p.list_price, 1) AS margin_pct
FROM products p
JOIN categories c ON p.category_id = c.category_id
JOIN order_items oi ON p.product_id = oi.product_id
JOIN orders o ON oi.order_id = o.order_id
WHERE o.status NOT IN ('cancelled', 'refunded')
GROUP BY p.product_id, p.name, c.name, p.list_price, p.cost_price
)
SELECT
product_name,
category,
list_price,
units_sold,
total_revenue,
margin_pct,
RANK() OVER (ORDER BY total_revenue DESC) AS revenue_rank,
RANK() OVER (PARTITION BY category ORDER BY total_revenue DESC) AS rank_in_category
FROM product_revenue
ORDER BY total_revenue DESC
LIMIT 50;
-- Products frequently bought together (market basket analysis)
SELECT
p1.name AS product_1,
p2.name AS product_2,
COUNT(*) AS times_bought_together,
ROUND(100.0 * COUNT(*) / (
SELECT COUNT(DISTINCT order_id) FROM orders
WHERE status NOT IN ('cancelled', 'refunded')
), 2) AS support_pct
FROM order_items oi1
JOIN order_items oi2
ON oi1.order_id = oi2.order_id
AND oi1.product_id < oi2.product_id -- avoid duplicates
JOIN products p1 ON oi1.product_id = p1.product_id
JOIN products p2 ON oi2.product_id = p2.product_id
GROUP BY p1.product_id, p1.name, p2.product_id, p2.name
HAVING COUNT(*) >= 10
ORDER BY times_bought_together DESC
LIMIT 20;Phase 6: Inventory Management
-- Low stock alerts with sales velocity
WITH sales_30d AS (
SELECT
oi.product_id,
SUM(oi.quantity) AS units_sold_30d
FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
WHERE o.created_at >= CURRENT_DATE - 30
AND o.status NOT IN ('cancelled', 'refunded')
GROUP BY oi.product_id
)
SELECT
p.sku,
p.name,
i.warehouse,
i.quantity AS stock,
i.reorder_level,
COALESCE(s.units_sold_30d, 0) AS sold_last_30d,
-- Days of stock remaining
CASE
WHEN COALESCE(s.units_sold_30d, 0) = 0 THEN NULL
ELSE ROUND(i.quantity / (s.units_sold_30d / 30.0))
END AS days_of_stock,
CASE
WHEN i.quantity <= i.reorder_level THEN 'REORDER NOW'
WHEN i.quantity <= i.reorder_level * 2 THEN 'Low Stock'
ELSE 'OK'
END AS stock_status
FROM inventory i
JOIN products p ON i.product_id = p.product_id
LEFT JOIN sales_30d s ON i.product_id = s.product_id
WHERE p.is_active = TRUE
ORDER BY
CASE WHEN i.quantity <= i.reorder_level THEN 0 ELSE 1 END,
days_of_stock ASC NULLS LAST;Phase 7: Fraud Detection
-- Orders flagged for potential fraud
WITH order_flags AS (
SELECT
o.order_id,
o.customer_id,
o.total_amount,
o.created_at,
o.payment_method,
-- Flag: multiple orders from same customer in 1 hour
COUNT(*) OVER (
PARTITION BY o.customer_id
ORDER BY o.created_at
RANGE BETWEEN INTERVAL '1 hour' PRECEDING AND CURRENT ROW
) AS orders_last_hour,
-- Flag: order significantly above customer average
o.total_amount / NULLIF(AVG(o.total_amount) OVER (PARTITION BY o.customer_id), 0) AS multiple_of_avg,
-- Flag: first order from new account
o.created_at - c.signup_date AS account_age_at_order
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.status NOT IN ('cancelled', 'refunded')
)
SELECT
order_id,
customer_id,
total_amount,
payment_method,
orders_last_hour,
ROUND(multiple_of_avg, 1) AS multiple_of_avg,
EXTRACT(EPOCH FROM account_age_at_order) / 3600 AS account_age_hours,
-- Aggregate risk score
(
CASE WHEN orders_last_hour > 3 THEN 30 ELSE 0 END +
CASE WHEN multiple_of_avg > 5 THEN 30 ELSE 0 END +
CASE WHEN account_age_at_order < INTERVAL '1 hour' THEN 40 ELSE 0 END
) AS risk_score
FROM order_flags
WHERE (
orders_last_hour > 3 OR
multiple_of_avg > 5 OR
account_age_at_order < INTERVAL '1 hour'
)
ORDER BY risk_score DESC;Phase 8: Views and Stored Procedures
-- Materialized view for slow dashboard queries
CREATE MATERIALIZED VIEW mv_customer_lifetime AS
SELECT
c.customer_id,
c.email,
c.tier,
c.signup_date,
COUNT(DISTINCT o.order_id) AS total_orders,
SUM(o.total_amount) AS lifetime_value,
MAX(o.created_at) AS last_order_date,
MIN(o.created_at) AS first_order_date,
AVG(o.total_amount) AS avg_order_value
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
AND o.status NOT IN ('cancelled', 'refunded')
GROUP BY c.customer_id, c.email, c.tier, c.signup_date;
-- Refresh nightly
CREATE UNIQUE INDEX ON mv_customer_lifetime(customer_id);
-- Schedule: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_customer_lifetime;
-- Stored procedure: process an order
CREATE OR REPLACE PROCEDURE process_order(
p_order_id INT,
p_new_status VARCHAR
)
LANGUAGE plpgsql AS $$
BEGIN
-- Update order status
UPDATE orders
SET status = p_new_status, updated_at = NOW()
WHERE order_id = p_order_id;
-- If delivered, deduct from inventory
IF p_new_status = 'delivered' THEN
UPDATE inventory i
SET quantity = i.quantity - oi.quantity,
last_updated = NOW()
FROM order_items oi
WHERE oi.order_id = p_order_id
AND i.product_id = oi.product_id;
END IF;
COMMIT;
END;
$$;Running the Project
# Start PostgreSQL locally
docker run -d \
--name shopflow-db \
-e POSTGRES_DB=shopflow \
-e POSTGRES_USER=shopflow \
-e POSTGRES_PASSWORD=shopflow \
-p 5432:5432 \
postgres:16
# Connect
psql -h localhost -U shopflow -d shopflow
# Run schema
psql -h localhost -U shopflow -d shopflow -f schema.sql
psql -h localhost -U shopflow -d shopflow -f seed.sql
# Run analytics
psql -h localhost -U shopflow -d shopflow -f queries/rfm_analysis.sqlWhat This Project Teaches
| Skill | Where used | |-------|-----------| | Schema design | Phase 1 | | Constraints and referential integrity | Phase 1 | | Index strategy | Phase 1 | | Aggregations and GROUP BY | Phase 3 | | Window functions | Phases 3, 4, 7 | | CTEs | All phases | | Recursive CTEs | Categories hierarchy | | CASE expressions | Phases 4, 6, 7 | | Materialized views | Phase 8 | | Stored procedures | Phase 8 | | Self-joins | Phase 5 | | Performance tuning | Throughout |
The full source code includes a Python script to generate 500,000 realistic test rows and a Metabase dashboard configuration.