Back to blog
Data Engineering · Intermediate

SQL Real-World Project: E-Commerce Analytics Database

Build a complete e-commerce analytics system from scratch. Design the schema, load data, write complex reporting queries, and create a dashboard data layer — exactly as done in production.

Learnixo · April 13, 2026 · 11 min read
View Source
SQL · Database · Project · Analytics · E-Commerce · PostgreSQL
Share:𝕏

Project Overview

We're building the analytics backend for ShopFlow — a mid-sized e-commerce company with 50,000+ customers, 10,000+ products, and 500,000+ orders.

The analytics team needs:

  1. A well-designed schema that supports complex reporting
  2. Real-time KPI dashboards (revenue, conversion, retention)
  3. Customer segmentation for marketing campaigns
  4. Inventory and product performance reports
  5. Fraud detection queries

This is exactly the kind of work a data engineer or senior backend developer does in their first month at a product company.


Phase 1: Schema Design

Good schema design makes queries fast and maintainable. Bad design leads to unmaintainable queries and slow reports.

SQL
-- ============================================================
-- CORE TABLES
-- ============================================================

-- Customer accounts. One row per registered user.
CREATE TABLE customers (
  customer_id    SERIAL PRIMARY KEY,
  email          VARCHAR(255) UNIQUE NOT NULL,
  first_name     VARCHAR(100),
  last_name      VARCHAR(100),
  phone          VARCHAR(20),
  date_of_birth  DATE,
  -- Enforce the documented value set (consistent with the tier CHECK below);
  -- previously any single character was accepted. NULL still allowed.
  gender         CHAR(1)
                   CHECK (gender IN ('M', 'F', 'O')),  -- 'M', 'F', 'O'
  signup_date    TIMESTAMP NOT NULL DEFAULT NOW(),
  last_login     TIMESTAMP,
  is_active      BOOLEAN NOT NULL DEFAULT TRUE,
  tier           VARCHAR(20) NOT NULL DEFAULT 'free'
                   CHECK (tier IN ('free', 'pro', 'enterprise'))
);

-- Billing/shipping addresses; a customer may have several of each.
CREATE TABLE addresses (
  address_id     SERIAL PRIMARY KEY,
  customer_id    INT NOT NULL REFERENCES customers(customer_id),
  -- The comment documented the value set but nothing enforced it; CHECK
  -- added for consistency with orders.status / customers.tier
  type           VARCHAR(20) NOT NULL
                   CHECK (type IN ('billing', 'shipping')),  -- 'billing', 'shipping'
  street         VARCHAR(255),
  city           VARCHAR(100),
  state          VARCHAR(100),
  country        CHAR(2) NOT NULL,      -- ISO country code
  postcode       VARCHAR(20),
  is_default     BOOLEAN DEFAULT FALSE
);

-- Product taxonomy. parent_id self-references this table, modelling an
-- arbitrary-depth hierarchy (traverse with a recursive CTE).
CREATE TABLE categories (
  category_id    SERIAL PRIMARY KEY,
  name           VARCHAR(100) NOT NULL,
  parent_id      INT REFERENCES categories(category_id),  -- hierarchical; NULL = top-level category
  slug           VARCHAR(100) UNIQUE  -- URL-friendly identifier
);

-- Product catalogue. Prices here are CURRENT prices; the price actually
-- charged is captured on order_items.unit_price at purchase time.
CREATE TABLE products (
  product_id     SERIAL PRIMARY KEY,
  sku            VARCHAR(100) UNIQUE NOT NULL,  -- stock-keeping unit: the business-facing identifier
  name           VARCHAR(255) NOT NULL,
  description    TEXT,
  category_id    INT REFERENCES categories(category_id),  -- nullable: products may be uncategorised
  brand          VARCHAR(100),
  cost_price     DECIMAL(10,2),         -- what we paid for it; NULL = cost unknown
  list_price     DECIMAL(10,2) NOT NULL, -- displayed price
  weight_kg      DECIMAL(8,3),
  is_active      BOOLEAN DEFAULT TRUE,  -- delisting flag (soft delete)
  created_at     TIMESTAMP DEFAULT NOW()
);

-- Stock on hand, one row per product per warehouse.
CREATE TABLE inventory (
  inventory_id   SERIAL PRIMARY KEY,
  product_id     INT NOT NULL REFERENCES products(product_id),
  warehouse      VARCHAR(50),
  -- Stock can never legitimately be negative; failing loudly on a bad
  -- deduction beats silently reporting negative availability downstream
  quantity       INT NOT NULL DEFAULT 0 CHECK (quantity >= 0),
  reorder_level  INT DEFAULT 10,
  last_updated   TIMESTAMP DEFAULT NOW(),
  -- Duplicate (product, warehouse) rows would double-count stock in every
  -- report and be deducted twice by order processing
  UNIQUE (product_id, warehouse)
);

-- One row per checkout. Monetary columns are snapshotted at order time so
-- later product/price changes do not rewrite history.
CREATE TABLE orders (
  order_id       SERIAL PRIMARY KEY,
  customer_id    INT NOT NULL REFERENCES customers(customer_id),
  order_number   VARCHAR(50) UNIQUE NOT NULL,  -- human-readable e.g. ORD-2025-001234
  status         VARCHAR(20) NOT NULL DEFAULT 'pending'
                   CHECK (status IN ('pending','processing','shipped','delivered','cancelled','refunded')),
  payment_method VARCHAR(50),
  shipping_address_id INT REFERENCES addresses(address_id),
  subtotal       DECIMAL(10,2) NOT NULL,
  discount_amount DECIMAL(10,2) DEFAULT 0,
  shipping_cost  DECIMAL(10,2) DEFAULT 0,
  tax_amount     DECIMAL(10,2) DEFAULT 0,
  -- NOTE(review): nothing enforces
  -- total_amount = subtotal - discount_amount + shipping_cost + tax_amount;
  -- confirm whether a CHECK constraint for that invariant is wanted
  total_amount   DECIMAL(10,2) NOT NULL,
  currency       CHAR(3) DEFAULT 'GBP',  -- ISO 4217 code
  notes          TEXT,
  created_at     TIMESTAMP NOT NULL DEFAULT NOW(),
  updated_at     TIMESTAMP DEFAULT NOW()  -- maintained by application code; no trigger defined here
);

-- Line items: one row per product per order.
CREATE TABLE order_items (
  item_id        SERIAL PRIMARY KEY,
  order_id       INT NOT NULL REFERENCES orders(order_id),
  product_id     INT NOT NULL REFERENCES products(product_id),
  -- Zero/negative quantities and out-of-range discounts would silently
  -- corrupt every revenue report built on this table; reject them at write
  quantity       INT NOT NULL CHECK (quantity > 0),
  unit_price     DECIMAL(10,2) NOT NULL,   -- price at time of purchase
  discount_pct   DECIMAL(5,2) DEFAULT 0
                   CHECK (discount_pct >= 0 AND discount_pct <= 100),  -- percentage discount on this item
  line_total     DECIMAL(10,2) NOT NULL
);

-- Discount codes. A coupon's usage on a given order is recorded in
-- order_coupons.
CREATE TABLE coupons (
  coupon_id      SERIAL PRIMARY KEY,
  code           VARCHAR(50) UNIQUE NOT NULL,
  -- Enforce the documented value set, consistent with orders.status
  discount_type  VARCHAR(20) NOT NULL
                   CHECK (discount_type IN ('percentage', 'fixed')),  -- 'percentage', 'fixed'
  discount_value DECIMAL(10,2) NOT NULL,
  min_order_value DECIMAL(10,2) DEFAULT 0,
  max_uses       INT,            -- NULL = unlimited
  times_used     INT DEFAULT 0,
  valid_from     DATE,
  valid_until    DATE,
  is_active      BOOLEAN DEFAULT TRUE,
  -- Reject inverted validity windows. A NULL bound still passes (the
  -- comparison is UNKNOWN, which CHECK accepts), so open-ended coupons work.
  CHECK (valid_from <= valid_until)
);

-- Junction table: which coupons were applied to which orders.
-- The composite PK allows a given coupon at most once per order.
CREATE TABLE order_coupons (
  order_id       INT NOT NULL REFERENCES orders(order_id),
  coupon_id      INT NOT NULL REFERENCES coupons(coupon_id),
  discount_applied DECIMAL(10,2),  -- actual amount taken off this order
  PRIMARY KEY (order_id, coupon_id)
);

-- ============================================================
-- EVENT / ANALYTICS TABLES
-- ============================================================

-- Clickstream events; expected to be high-volume, hence the BIGSERIAL key.
CREATE TABLE page_views (
  view_id        BIGSERIAL PRIMARY KEY,
  session_id     VARCHAR(100),
  customer_id    INT REFERENCES customers(customer_id),  -- NULL if anonymous
  page_url       TEXT,
  product_id     INT REFERENCES products(product_id),    -- NULL if not a product page
  viewed_at      TIMESTAMP NOT NULL DEFAULT NOW(),
  time_on_page   INT,    -- seconds
  referrer       TEXT,
  device_type    VARCHAR(20)  -- 'desktop', 'mobile', 'tablet'
);

-- Shopping-cart interaction events (add/remove/view), keyed by session.
CREATE TABLE cart_events (
  event_id       BIGSERIAL PRIMARY KEY,
  session_id     VARCHAR(100),
  customer_id    INT REFERENCES customers(customer_id),  -- NULL if anonymous
  product_id     INT NOT NULL REFERENCES products(product_id),
  -- Enforce the documented action set; the conversion-funnel query depends
  -- on the exact string 'add', so typos here would silently skew it
  action         VARCHAR(20) NOT NULL
                   CHECK (action IN ('add', 'remove', 'view_cart')),  -- 'add', 'remove', 'view_cart'
  quantity       INT,
  event_at       TIMESTAMP NOT NULL DEFAULT NOW()
);

Indexes for performance

SQL
-- Orders: most common query patterns.
-- The composite (status, created_at) index serves status-only filters via
-- its leftmost prefix, so a separate single-column status index would be
-- redundant and only add write overhead; it has been dropped.
CREATE INDEX idx_orders_customer    ON orders(customer_id);
CREATE INDEX idx_orders_created     ON orders(created_at);   -- date-range scans without a status filter
CREATE INDEX idx_orders_status_date ON orders(status, created_at);

-- Order items: JOIN performance (FK columns are not auto-indexed in Postgres)
CREATE INDEX idx_items_order   ON order_items(order_id);
CREATE INDEX idx_items_product ON order_items(product_id);

-- Page views: analytics queries
CREATE INDEX idx_pageviews_customer ON page_views(customer_id);
CREATE INDEX idx_pageviews_product  ON page_views(product_id);
CREATE INDEX idx_pageviews_time     ON page_views(viewed_at);

-- Cart events
CREATE INDEX idx_cart_customer ON cart_events(customer_id);
CREATE INDEX idx_cart_product  ON cart_events(product_id);

Phase 2: Sample Data Setup

SQL
-- Insert sample categories (hierarchical).
-- Roots first, then children that look up their parent by slug. The
-- previous version hard-coded parent_id 1 and 2, which silently breaks
-- whenever the SERIAL sequence does not start at 1 (re-runs, restores).
INSERT INTO categories (name, parent_id, slug) VALUES
  ('Electronics', NULL, 'electronics'),
  ('Clothing',    NULL, 'clothing'),
  ('Books',       NULL, 'books');

INSERT INTO categories (name, parent_id, slug) VALUES
  ('Laptops',  (SELECT category_id FROM categories WHERE slug = 'electronics'), 'laptops'),
  ('Phones',   (SELECT category_id FROM categories WHERE slug = 'electronics'), 'phones'),
  ('T-Shirts', (SELECT category_id FROM categories WHERE slug = 'clothing'),    'tshirts'),
  ('Jackets',  (SELECT category_id FROM categories WHERE slug = 'clothing'),    'jackets');

-- Generate 1000 test customers (PostgreSQL).
-- Names/tiers cycle deterministically via i % 6; signup_date is a random
-- instant in the last ~2 years (call SETSEED() first if you need
-- reproducible fixtures).
INSERT INTO customers (email, first_name, last_name, signup_date, tier)
SELECT
  'user' || i || '@example.com',
  (ARRAY['Alice','Bob','Carol','David','Eve','Frank'])[1 + (i % 6)],
  (ARRAY['Smith','Jones','Patel','Kim','Müller','García'])[1 + (i % 6)],
  NOW() - (RANDOM() * INTERVAL '730 days'),
  (ARRAY['free','free','free','pro','pro','enterprise'])[1 + (i % 6)]
FROM generate_series(1, 1000) AS i;

Phase 3: KPI Dashboard Queries

These are the queries that power the analytics dashboard on the home page.

Revenue summary

SQL
-- One-row KPI snapshot for the revenue dashboard.
-- All figures exclude cancelled/refunded orders. Each SUM(CASE ...) adds an
-- order only when it falls inside the window; rows outside contribute NULL,
-- which SUM ignores (a window with no orders therefore yields NULL, not 0).
CREATE OR REPLACE VIEW v_revenue_summary AS
WITH periods AS (
  SELECT
    SUM(CASE WHEN created_at >= CURRENT_DATE                           THEN total_amount END) AS today,
    -- NOTE(review): >= CURRENT_DATE - 7 spans today PLUS the previous
    -- 7 days, i.e. up to 8 calendar days — confirm the intended window
    SUM(CASE WHEN created_at >= CURRENT_DATE - 7                       THEN total_amount END) AS last_7_days,
    SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE)      THEN total_amount END) AS mtd,
    SUM(CASE WHEN created_at >= DATE_TRUNC('year', CURRENT_DATE)       THEN total_amount END) AS ytd,
    -- NOTE(review): this_month differs from mtd only if orders can carry a
    -- future created_at; otherwise the two columns are identical
    SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE)
              AND created_at <  DATE_TRUNC('month', CURRENT_DATE) + INTERVAL '1 month'
                                                                        THEN total_amount END) AS this_month,
    SUM(CASE WHEN created_at >= DATE_TRUNC('month', CURRENT_DATE) - INTERVAL '1 month'
              AND created_at <  DATE_TRUNC('month', CURRENT_DATE)       THEN total_amount END) AS last_month
  FROM orders
  WHERE status NOT IN ('cancelled', 'refunded')
)
SELECT
  today,
  last_7_days,
  mtd,
  ytd,
  this_month,
  last_month,
  -- NULLIF makes a zero-revenue previous month yield NULL growth instead of
  -- a division error. Note this compares a PARTIAL current month against a
  -- complete previous month.
  ROUND(100.0 * (this_month - last_month) / NULLIF(last_month, 0), 1) AS mom_growth_pct
FROM periods;

Daily revenue chart (last 30 days)

SQL
-- Daily order count and revenue for the last 30 days, zero-filled.
WITH date_spine AS (
  -- One row per calendar day so days with no orders still chart as 0
  SELECT generate_series(
    CURRENT_DATE - 29,
    CURRENT_DATE,
    '1 day'::INTERVAL
  )::DATE AS d
),
daily_orders AS (
  SELECT
    created_at::DATE  AS order_date,
    COUNT(*)          AS order_count,
    SUM(total_amount) AS revenue
  FROM orders
  WHERE status NOT IN ('cancelled', 'refunded')
    AND created_at >= CURRENT_DATE - 29
  GROUP BY created_at::DATE  -- named expression instead of positional "GROUP BY 1"
)
SELECT
  ds.d AS date,
  COALESCE(dly.order_count, 0) AS orders,
  COALESCE(dly.revenue, 0)     AS revenue
FROM date_spine ds
-- BUG FIX: the previous alias "do" is a fully reserved keyword in
-- PostgreSQL, so the query failed to parse; use a non-reserved alias.
LEFT JOIN daily_orders dly ON ds.d = dly.order_date
ORDER BY ds.d;

Conversion funnel

SQL
-- Conversion funnel with each stage as a percentage of the top stage.
WITH funnel AS (
  -- BUG FIX: the previous version numbered stages with ROW_NUMBER() OVER ()
  -- and no ORDER BY, which has unspecified ordering — the funnel stages
  -- could come out shuffled. Step numbers are now explicit literals.
  SELECT 1 AS step, 'Sessions' AS stage, COUNT(DISTINCT session_id) AS users
  FROM page_views
  UNION ALL
  SELECT 2, 'Product Views', COUNT(DISTINCT session_id)
  FROM page_views WHERE product_id IS NOT NULL
  UNION ALL
  SELECT 3, 'Add to Cart', COUNT(DISTINCT session_id)
  FROM cart_events WHERE action = 'add'
  UNION ALL
  -- NOTE(review): this stage counts customers (not sessions) and is the
  -- only stage restricted to the last 30 days — confirm the mixed units
  -- and time windows are intentional before comparing stages directly
  SELECT 4, 'Checkout', COUNT(DISTINCT o.customer_id)
  FROM orders o WHERE created_at >= CURRENT_DATE - 30
)
SELECT
  step,
  stage,
  users,
  -- FIRST_VALUE over ORDER BY step = the 'Sessions' count for every row
  ROUND(100.0 * users / FIRST_VALUE(users) OVER (ORDER BY step), 1) AS pct_of_top
FROM funnel
ORDER BY step;

Phase 4: Customer Segmentation (RFM Analysis)

RFM = Recency (when did they last buy?), Frequency (how often?), Monetary (how much did they spend?).

SQL
-- RFM segmentation: quintile-score customers on Recency / Frequency /
-- Monetary over completed orders, then bucket into named segments.
WITH rfm_base AS (
  -- Raw R/F/M values, one row per customer with at least one completed order
  SELECT
    customer_id,
    CURRENT_DATE - MAX(created_at)::DATE AS recency_days,
    COUNT(DISTINCT order_id)              AS frequency,
    SUM(total_amount)                     AS monetary
  FROM orders
  WHERE status NOT IN ('cancelled', 'refunded')
  GROUP BY customer_id
),
rfm_scores AS (
  SELECT
    customer_id,
    recency_days,
    frequency,
    monetary,
    -- BUG FIX: recency must be ranked DESC so that FEWER days since the
    -- last order earns a HIGHER score. The previous ASC ordering gave the
    -- most recent buyers r_score = 1, so 'Champions' were actually the
    -- most lapsed customers.
    NTILE(5) OVER (ORDER BY recency_days DESC) AS r_score,  -- lower days = higher score
    NTILE(5) OVER (ORDER BY frequency ASC)     AS f_score,
    NTILE(5) OVER (ORDER BY monetary ASC)      AS m_score
  FROM rfm_base
),
rfm_segments AS (
  -- First matching CASE branch wins; order the rules from most to least
  -- specific
  SELECT *,
    CASE
      WHEN r_score >= 4 AND f_score >= 4 AND m_score >= 4 THEN 'Champions'
      WHEN r_score >= 4 AND f_score >= 2                   THEN 'Loyal Customers'
      WHEN r_score >= 4 AND f_score = 1                    THEN 'Recent Customers'
      WHEN r_score = 3  AND f_score >= 3                   THEN 'Potential Loyalists'
      WHEN r_score = 2  AND f_score >= 4                   THEN 'At Risk'
      WHEN r_score <= 2 AND f_score <= 2                   THEN 'Hibernating'
      ELSE 'Needs Attention'
    END AS segment
  FROM rfm_scores
)
SELECT
  segment,
  COUNT(*)             AS customers,
  ROUND(AVG(monetary)) AS avg_spend,
  ROUND(AVG(frequency), 1) AS avg_orders,
  ROUND(AVG(recency_days)) AS avg_days_since_order
FROM rfm_segments
GROUP BY segment
ORDER BY avg_spend DESC;

Phase 5: Product Analytics

SQL
-- Top products by revenue with category breakdown
WITH product_revenue AS (
  SELECT
    p.product_id,
    p.name          AS product_name,
    c.name          AS category,   -- NULL for uncategorised products
    p.list_price,
    SUM(oi.quantity)            AS units_sold,
    SUM(oi.line_total)          AS total_revenue,
    COUNT(DISTINCT oi.order_id) AS orders_containing,
    -- Gross margin on list price. NULLIF guards a zero list_price
    -- (division by zero → NULL margin); unknown cost is treated as zero.
    ROUND(100.0 * (p.list_price - COALESCE(p.cost_price, 0))
                / NULLIF(p.list_price, 0), 1) AS margin_pct
  FROM products p
  -- BUG FIX: category_id is nullable, so an INNER JOIN here silently
  -- dropped uncategorised products from the report
  LEFT JOIN categories c ON p.category_id = c.category_id
  JOIN order_items oi    ON p.product_id = oi.product_id
  JOIN orders o          ON oi.order_id = o.order_id
  WHERE o.status NOT IN ('cancelled', 'refunded')
  GROUP BY p.product_id, p.name, c.name, p.list_price, p.cost_price
)
SELECT
  product_name,
  category,
  list_price,
  units_sold,
  total_revenue,
  margin_pct,
  RANK() OVER (ORDER BY total_revenue DESC)        AS revenue_rank,
  RANK() OVER (PARTITION BY category ORDER BY total_revenue DESC) AS rank_in_category
FROM product_revenue
ORDER BY total_revenue DESC
LIMIT 50;

-- Products frequently bought together (market basket analysis)
SELECT
  p1.name AS product_1,
  p2.name AS product_2,
  COUNT(*) AS times_bought_together,
  -- Support = share of completed orders containing this pair
  ROUND(100.0 * COUNT(*) / (
    SELECT COUNT(DISTINCT order_id) FROM orders
    WHERE status NOT IN ('cancelled', 'refunded')
  ), 2) AS support_pct
FROM order_items oi1
JOIN order_items oi2
  ON oi1.order_id = oi2.order_id
  AND oi1.product_id < oi2.product_id  -- count each unordered pair once
-- BUG FIX: the denominator excludes cancelled/refunded orders but the
-- numerator previously counted pairs from ALL orders, inflating
-- support_pct; filter the numerator the same way
JOIN orders o ON oi1.order_id = o.order_id
JOIN products p1 ON oi1.product_id = p1.product_id
JOIN products p2 ON oi2.product_id = p2.product_id
WHERE o.status NOT IN ('cancelled', 'refunded')
GROUP BY p1.product_id, p1.name, p2.product_id, p2.name
HAVING COUNT(*) >= 10
ORDER BY times_bought_together DESC
LIMIT 20;

Phase 6: Inventory Management

SQL
-- Low-stock alert report: current stock per warehouse, 30-day sales
-- velocity, estimated days of cover, and a triage status. Items already at
-- or below their reorder level sort to the top.
WITH recent_sales AS (
  -- Units sold per product over the trailing 30 days, completed orders only
  SELECT
    oi.product_id,
    SUM(oi.quantity) AS units_sold_30d
  FROM order_items oi
  JOIN orders o ON oi.order_id = o.order_id
  WHERE o.created_at >= CURRENT_DATE - 30
    AND o.status NOT IN ('cancelled', 'refunded')
  GROUP BY oi.product_id
)
SELECT
  prod.sku,
  prod.name,
  inv.warehouse,
  inv.quantity      AS stock,
  inv.reorder_level,
  COALESCE(rs.units_sold_30d, 0) AS sold_last_30d,
  -- Days of cover at the current daily velocity (units_sold_30d / 30);
  -- NULL when the product has not sold at all in the window
  CASE
    WHEN COALESCE(rs.units_sold_30d, 0) = 0 THEN NULL
    ELSE ROUND(inv.quantity / (rs.units_sold_30d / 30.0))
  END AS days_of_stock,
  -- Triage bucket driven purely by stock vs reorder level
  CASE
    WHEN inv.quantity <= inv.reorder_level THEN 'REORDER NOW'
    WHEN inv.quantity <= inv.reorder_level * 2 THEN 'Low Stock'
    ELSE 'OK'
  END AS stock_status
FROM inventory inv
JOIN products prod ON inv.product_id = prod.product_id
LEFT JOIN recent_sales rs ON inv.product_id = rs.product_id
WHERE prod.is_active = TRUE
ORDER BY
  -- Items needing reorder first, then by soonest stock-out
  CASE WHEN inv.quantity <= inv.reorder_level THEN 0 ELSE 1 END,
  days_of_stock ASC NULLS LAST;

Phase 7: Fraud Detection

SQL
-- Orders flagged for potential fraud.
-- Three independent signals are computed per order, then combined into a
-- risk score (max 30 + 30 + 40 = 100) used only for triage ordering.
WITH order_flags AS (
  SELECT
    o.order_id,
    o.customer_id,
    o.total_amount,
    o.created_at,
    o.payment_method,
    -- Flag: multiple orders from same customer in 1 hour.
    -- The RANGE frame counts this customer's orders in the trailing hour
    -- INCLUDING the current row, so the minimum value is 1.
    COUNT(*) OVER (
      PARTITION BY o.customer_id
      ORDER BY o.created_at
      RANGE BETWEEN INTERVAL '1 hour' PRECEDING AND CURRENT ROW
    ) AS orders_last_hour,
    -- Flag: order significantly above customer average.
    -- The average is taken over ALL of the customer's non-cancelled orders
    -- (including this one); NULLIF guards a zero average.
    o.total_amount / NULLIF(AVG(o.total_amount) OVER (PARTITION BY o.customer_id), 0) AS multiple_of_avg,
    -- Flag: first order from new account (INTERVAL between the two timestamps)
    o.created_at - c.signup_date AS account_age_at_order
  FROM orders o
  JOIN customers c ON o.customer_id = c.customer_id
  WHERE o.status NOT IN ('cancelled', 'refunded')
)
SELECT
  order_id,
  customer_id,
  total_amount,
  payment_method,
  orders_last_hour,
  ROUND(multiple_of_avg, 1) AS multiple_of_avg,
  -- Convert the age INTERVAL to fractional hours for readability
  EXTRACT(EPOCH FROM account_age_at_order) / 3600 AS account_age_hours,
  -- Aggregate risk score (weights: burst 30, outlier spend 30, new account 40)
  (
    CASE WHEN orders_last_hour > 3           THEN 30 ELSE 0 END +
    CASE WHEN multiple_of_avg > 5            THEN 30 ELSE 0 END +
    CASE WHEN account_age_at_order < INTERVAL '1 hour' THEN 40 ELSE 0 END
  ) AS risk_score
FROM order_flags
-- Return only rows tripping at least one flag; these thresholds must stay
-- in sync with the CASE expressions above
WHERE (
  orders_last_hour > 3 OR
  multiple_of_avg > 5 OR
  account_age_at_order < INTERVAL '1 hour'
)
ORDER BY risk_score DESC;

Phase 8: Views and Stored Procedures

SQL
-- Materialized view for slow dashboard queries: lifetime stats per
-- customer, counting only non-cancelled/non-refunded orders.
CREATE MATERIALIZED VIEW mv_customer_lifetime AS
SELECT
  c.customer_id,
  c.email,
  c.tier,
  c.signup_date,
  COUNT(DISTINCT o.order_id)       AS total_orders,
  -- FIX: SUM over zero rows is NULL, so customers with no completed
  -- orders previously showed NULL lifetime value instead of 0
  COALESCE(SUM(o.total_amount), 0) AS lifetime_value,
  MAX(o.created_at)                AS last_order_date,   -- NULL when no orders
  MIN(o.created_at)                AS first_order_date,  -- NULL when no orders
  AVG(o.total_amount)              AS avg_order_value    -- NULL when no orders (no meaningful average)
FROM customers c
-- The status filter belongs in ON, not WHERE: in WHERE it would turn this
-- LEFT JOIN into an inner join and drop customers with no orders
LEFT JOIN orders o ON c.customer_id = o.customer_id
  AND o.status NOT IN ('cancelled', 'refunded')
GROUP BY c.customer_id, c.email, c.tier, c.signup_date;

-- Refresh nightly. The index is named explicitly (greppable in errors and
-- migrations) and a unique index is REQUIRED for REFRESH ... CONCURRENTLY.
CREATE UNIQUE INDEX mv_customer_lifetime_customer_id_idx
  ON mv_customer_lifetime(customer_id);
-- Schedule: REFRESH MATERIALIZED VIEW CONCURRENTLY mv_customer_lifetime;

-- Stored procedure: process an order.
-- Moves the order to p_new_status and, on the transition INTO 'delivered',
-- deducts the ordered quantities from inventory. Raises if the order does
-- not exist; re-running with the same status no longer deducts stock twice.
CREATE OR REPLACE PROCEDURE process_order(
  p_order_id    INT,
  p_new_status  VARCHAR
)
LANGUAGE plpgsql AS $$
DECLARE
  v_old_status VARCHAR;
BEGIN
  -- Lock the row and capture the previous status so we can (a) fail loudly
  -- on unknown orders instead of silently updating zero rows, and
  -- (b) detect repeat 'delivered' calls.
  SELECT status INTO v_old_status
  FROM orders
  WHERE order_id = p_order_id
  FOR UPDATE;

  IF NOT FOUND THEN
    RAISE EXCEPTION 'process_order: order % not found', p_order_id;
  END IF;

  -- Update order status (the table CHECK rejects unknown status values)
  UPDATE orders
  SET status = p_new_status, updated_at = NOW()
  WHERE order_id = p_order_id;

  -- Deduct inventory only on the transition into 'delivered'
  IF p_new_status = 'delivered' AND v_old_status <> 'delivered' THEN
    -- NOTE(review): order_items carries no warehouse, so this deducts from
    -- EVERY inventory row for each product — confirm intended behavior when
    -- a product is stocked in multiple warehouses.
    UPDATE inventory i
    SET quantity = i.quantity - oi.quantity,
        last_updated = NOW()
    FROM order_items oi
    WHERE oi.order_id = p_order_id
      AND i.product_id = oi.product_id;
  END IF;

  COMMIT;
END;
$$;

Running the Project

Bash
# Start PostgreSQL locally
# (dev-only credentials for a throwaway container; never reuse elsewhere)
docker run -d \
  --name shopflow-db \
  -e POSTGRES_DB=shopflow \
  -e POSTGRES_USER=shopflow \
  -e POSTGRES_PASSWORD=shopflow \
  -p 5432:5432 \
  postgres:16

# Connect (psql prompts for the password set above)
psql -h localhost -U shopflow -d shopflow

# Run schema, then seed data — order matters: tables must exist first
psql -h localhost -U shopflow -d shopflow -f schema.sql
psql -h localhost -U shopflow -d shopflow -f seed.sql

# Run analytics
psql -h localhost -U shopflow -d shopflow -f queries/rfm_analysis.sql

What This Project Teaches

| Skill | Where used |
|-------|-----------|
| Schema design | Phase 1 |
| Constraints and referential integrity | Phase 1 |
| Index strategy | Phase 1 |
| Aggregations and GROUP BY | Phase 3 |
| Window functions | Phases 3, 4, 7 |
| CTEs | All phases |
| Recursive CTEs | Categories hierarchy |
| CASE expressions | Phases 4, 6, 7 |
| Materialized views | Phase 8 |
| Stored procedures | Phase 8 |
| Self-joins | Phase 5 |
| Performance tuning | Throughout |

The full source code includes a Python script to generate 500,000 realistic test rows and a Metabase dashboard configuration.

Enjoyed this article?

Explore the Data Engineering learning path for more.

Found this helpful?

Share:𝕏

Leave a comment

Have a question, correction, or just found this helpful? Leave a note below.