PDF Parsing for RAG

PDF Parsing Challenges

PDFs are not just text — they're a page layout format. Extracting clean text requires handling:

Multi-column layouts: Text flows down each column in turn, so naive top-to-bottom extraction across the full page width interleaves the columns
Tables: Tabular data needs structure preservation, not just extracted text
Headers and footers: Page numbers, document titles that should be removed
Embedded images: Text in images requires OCR
Scanned documents: Entire pages are images — all text requires OCR
Mathematical formulas: LaTeX or rendered math that doesn't extract cleanly

PyPDF2 and pypdf: Simple Text Extraction

Python

import pypdf
from pathlib import Path

def extract_text_pypdf(pdf_path: str) -> dict:
    """Extract the embedded text layer of a PDF with pypdf.

    Works for simple text PDFs; pages with no extractable text are skipped.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A dict with three keys:
          - "metadata": "n_pages" plus "title", "author" and "subject".
            The title falls back to the file stem and the other two to ""
            when the PDF does not provide a value.
          - "full_text": the non-empty pages joined by blank lines, each
            prefixed with a 1-based "[Page N]" marker.
          - "pages": whitespace-normalized text keyed by 0-based page index.
    """
    text_by_page = {}
    full_text = []

    with open(pdf_path, "rb") as f:
        reader = pypdf.PdfReader(f)

        # reader.metadata is None when the PDF carries no document
        # information dictionary; fall back to an empty mapping so the
        # lookups below cannot raise AttributeError.
        info = reader.metadata or {}
        metadata = {
            "n_pages": len(reader.pages),
            # `or` (rather than a .get default) also covers entries that
            # are present but empty/None.
            "title": info.get("/Title") or Path(pdf_path).stem,
            "author": info.get("/Author") or "",
            "subject": info.get("/Subject") or "",
        }

        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text and text.strip():
                # Basic cleanup: collapse every whitespace run to one space
                text = " ".join(text.split())
                text_by_page[page_num] = text
                full_text.append(f"[Page {page_num + 1}]\n{text}")

    return {
        "metadata": metadata,
        "full_text": "\n\n".join(full_text),
        "pages": text_by_page,
    }


def clean_extracted_text(text: str) -> str:
    """Clean common PDF extraction artifacts while keeping paragraph breaks.

    Steps, in order:
      1. Strip trailing spaces/tabs from every line.
      2. Drop lines that contain only a number (typical page numbers).
      3. Collapse runs of 3+ spaces/tabs to two spaces.
      4. Re-join words hyphenated across a line break.
      5. Collapse 3+ consecutive newlines to a single blank line.

    Args:
        text: Raw text as returned by a PDF/OCR extractor.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    import re

    # Strip trailing horizontal whitespace so whitespace-only lines become
    # truly empty and the newline normalization below can see them.
    text = re.sub(r'[ \t]+\n', '\n', text)

    # Remove page numbers (standalone numbers on their own line). Only
    # spaces/tabs may surround the digits, and the closing newline is a
    # lookahead: adjacent blank lines survive and consecutive number lines
    # are each matched.
    text = re.sub(r'\n[ \t]*\d+[ \t]*(?=\n)', '', text)

    # Remove excessive horizontal whitespace. Newlines are deliberately
    # excluded so paragraph breaks are not flattened into spaces.
    text = re.sub(r'[ \t]{3,}', '  ', text)

    # Fix hyphenated line breaks (common in justified PDFs)
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)

    # Normalize line endings: at most one blank line between paragraphs
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()

pdfplumber: Better Layout Handling

Python

import pdfplumber
import pandas as pd
from typing import Optional

def _table_to_markdown(rows: list) -> str:
    """Render extracted table rows (first row is the header) as markdown."""
    def _cell(value):
        # Empty cells come back as None; a newline inside a cell would
        # break the markdown row, so flatten it to a space.
        return "" if value is None else str(value).replace("\n", " ")

    cleaned = [[_cell(value) for value in row] for row in rows]
    df = pd.DataFrame(cleaned[1:], columns=cleaned[0])
    return df.to_markdown(index=False)


def _outside_tables(obj: dict, table_bboxes: list) -> bool:
    """Return True when a page object's centre lies outside every table bbox."""
    try:
        h_mid = (obj["x0"] + obj["x1"]) / 2
        v_mid = (obj["top"] + obj["bottom"]) / 2
    except KeyError:
        # An object without a bounding box cannot be placed; keep it.
        return True
    return not any(
        x0 <= h_mid < x1 and top <= v_mid < bottom
        for x0, top, x1, bottom in table_bboxes
    )


def extract_with_pdfplumber(
    pdf_path: str,
    extract_tables: bool = True,
) -> dict:
    """
    Extract text and tables from PDF using pdfplumber.

    Args:
        pdf_path: Path to the PDF file.
        extract_tables: When True, tables are detected, rendered as
            markdown, and their regions are left out of the page text so
            the same content is not emitted twice. When False, the page
            text is extracted as-is and "tables" stays empty.

    Returns:
        {"metadata": {"n_pages": ..., "metadata": <pdf.metadata>},
         "pages": [{"page": <1-based number>, "text": <cleaned text>,
                    "tables": [<markdown table>, ...]}, ...]}
    """
    pages_content = []

    with pdfplumber.open(pdf_path) as pdf:
        metadata = {
            "n_pages": len(pdf.pages),
            "metadata": pdf.metadata,
        }

        for page_num, page in enumerate(pdf.pages):
            page_content = {"page": page_num + 1, "text": "", "tables": []}

            text_source = page
            if extract_tables:
                # Extract tables first, remembering where each one sits
                table_bboxes = []
                for table in page.find_tables():
                    rows = table.extract()
                    if rows:
                        page_content["tables"].append(_table_to_markdown(rows))
                        # Only mask regions whose table was actually kept,
                        # so no content is lost.
                        table_bboxes.append(table.bbox)

                if table_bboxes:
                    text_source = page.filter(
                        lambda obj, boxes=table_bboxes: _outside_tables(obj, boxes)
                    )

            # Extract text, excluding table regions
            text = text_source.extract_text(x_tolerance=3, y_tolerance=3)
            if text:
                page_content["text"] = clean_extracted_text(text)

            pages_content.append(page_content)

    return {"metadata": metadata, "pages": pages_content}


def format_pdf_for_rag(pages_content: list[dict]) -> str:
    """
    Flatten per-page extraction results into one string for RAG ingestion.

    Each page contributes its "text" followed by every entry of its
    "tables" list (already markdown), each table wrapped as
    "\\n\\n<table>\\n". Pages whose combined content is blank are skipped;
    the remaining pages are joined with a blank line.
    """
    rendered = []

    for entry in pages_content:
        body = entry["text"]
        tables = entry["tables"]

        # Tables are appended after the page text, in their listed order
        if tables:
            body = body + "".join(f"\n\n{markdown}\n" for markdown in tables)

        if not body.strip():
            continue
        rendered.append(body)

    return "\n\n".join(rendered)

Docling: Advanced PDF Understanding

Docling (IBM) provides state-of-the-art PDF parsing with layout understanding:

Python

# pip install docling
from docling.document_converter import DocumentConverter

def extract_with_docling(pdf_path: str) -> dict:
    """
    Convert a PDF with Docling and return its markdown plus a flat item list.

    Returns a dict with:
      - "markdown": the converted document exported to markdown
      - "elements": one {"type", "text"} dict per item yielded by
        iterate_items(); "type" is the item's class name and "text" is its
        `.text` attribute, or str(item) when it has none
      - "n_pages": the document's reported page count
    """
    document = DocumentConverter().convert(pdf_path).document

    # Whole-document markdown export
    markdown_text = document.export_to_markdown()

    # Per-item view; the second member of each yielded pair is not used
    no_text = object()
    elements = []
    for item, _unused in document.iterate_items():
        item_text = getattr(item, "text", no_text)
        if item_text is no_text:
            item_text = str(item)
        elements.append({"type": type(item).__name__, "text": item_text})

    return {
        "markdown": markdown_text,
        "elements": elements,
        "n_pages": document.num_pages(),
    }


# Docling advantages:
# - Layout-aware: correctly handles multi-column, figures, tables
# - Table extraction: converts tables to clean markdown
# - Formula recognition: detects math formulas
# - Reading order: correct top-to-bottom reading order for multi-column layouts

LlamaParse: Cloud-Based AI Parsing

LlamaIndex's cloud parsing service uses an LLM to parse complex PDFs:

Python

# pip install llama-parse
from llama_parse import LlamaParse

# Default prompt, kept byte-for-byte from the original hard-coded value so
# existing callers get identical behavior.
_DEFAULT_PARSING_INSTRUCTION = (
    "This is a clinical pharmacology document.\n"
    "        Preserve table structure, drug names, dosing information, and interaction severity ratings.\n"
    "        Format tables as markdown tables."
)


def parse_with_llamaparse(
    pdf_path: str,
    api_key: str,
    result_type: str = "markdown",
    language: str = "en",
    parsing_instruction: str = _DEFAULT_PARSING_INSTRUCTION,
) -> str:
    """
    Parse PDF using LlamaParse (cloud, AI-powered).

    Args:
        pdf_path: Path to the PDF file to upload and parse.
        api_key: LlamaParse API key.
        result_type: Output format passed to LlamaParse ("markdown" or "text").
        language: Document language code passed to LlamaParse.
        parsing_instruction: Natural-language guidance for the parser.
            Defaults to a clinical-pharmacology prompt; pass a prompt that
            matches the document domain for other kinds of PDFs.

    Returns:
        The `.text` of every returned document, joined by blank lines.
    """
    parser = LlamaParse(
        api_key=api_key,
        result_type=result_type,       # "markdown" or "text"
        language=language,
        verbose=True,
        parsing_instruction=parsing_instruction,
    )

    # load_data returns a list of Document objects
    documents = parser.load_data(pdf_path)

    return "\n\n".join(doc.text for doc in documents)


# LlamaParse advantages:
# - Best-in-class accuracy for complex PDFs
# - Handles scanned PDFs (built-in OCR)
# - Customizable parsing instructions
# - Preserves table structure
# Cost: $3/1000 pages (as of 2025)

OCR for Scanned PDFs

Python

# pip install pytesseract pdf2image pillow
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import io

def extract_with_ocr(
    pdf_path: str,
    dpi: int = 300,
    language: str = "eng",
    tesseract_config: str = "--psm 6 --oem 3",
) -> str:
    """Extract text from scanned PDFs using Tesseract OCR.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Resolution used when rendering pages to images.
        language: Tesseract language code(s), passed as `lang`.
        tesseract_config: Extra Tesseract command-line options. The default
            uses page segmentation mode 6 (assume a single uniform block of
            text) and engine mode 3 (Tesseract's default: whichever engine
            is available). Other layouts, such as multi-column pages, may
            need a different `--psm` value.

    Returns:
        Cleaned text of every page that produced output, each prefixed with
        a 1-based "[Page N]" marker and joined by blank lines.
    """
    # Convert PDF pages to images (all pages are rendered up front)
    images = convert_from_path(pdf_path, dpi=dpi)

    all_text = []
    for page_num, image in enumerate(images):
        # Run Tesseract OCR
        text = pytesseract.image_to_string(
            image,
            lang=language,
            config=tesseract_config,
        )
        if text.strip():
            all_text.append(f"[Page {page_num + 1}]\n{clean_extracted_text(text)}")

    return "\n\n".join(all_text)


def smart_pdf_loader(pdf_path: str, min_chars_per_page: int = 100) -> str:
    """
    Automatically detect if PDF is text-based or scanned,
    and use the appropriate extraction method.

    Args:
        pdf_path: Path to the PDF file.
        min_chars_per_page: Average number of extracted characters per page
            below which the PDF is treated as scanned and sent to OCR.

    Returns:
        The OCR output when the PDF looks scanned, otherwise the
        "full_text" produced by the standard pypdf extraction.
    """
    # Try standard extraction first
    basic_result = extract_text_pypdf(pdf_path)

    # Count only the extracted page text. "full_text" also contains the
    # "[Page N]" markers and join separators, which would inflate the
    # average and could push a nearly empty scanned PDF over the threshold.
    extracted_chars = sum(len(page_text) for page_text in basic_result["pages"].values())
    avg_chars_per_page = extracted_chars / max(basic_result["metadata"]["n_pages"], 1)

    # If very little text was extracted, it's likely scanned
    if avg_chars_per_page < min_chars_per_page:
        print(f"PDF appears scanned ({avg_chars_per_page:.0f} chars/page). Using OCR.")
        return extract_with_ocr(pdf_path)

    print(f"PDF is text-based ({avg_chars_per_page:.0f} chars/page). Using standard extraction.")
    return basic_result["full_text"]

Chunking PDFs by Section

Preserve document structure during chunking:

Python

import re

def chunk_pdf_by_sections(
    pdf_text: str,
    document_id: str,
    title: str,
    max_chunk_size: int = 1000,  # characters
    chunk_overlap: int = 100,  # characters shared by consecutive sub-chunks
) -> list[dict]:
    """
    Chunk a PDF by detected section headers.
    Falls back to fixed-size chunking when sections are too large.

    Args:
        pdf_text: Extracted document text, one logical line per "\\n".
        document_id: Identifier used as the prefix of every chunk id.
        title: Header assigned to any text that precedes the first
            detected section header.
        max_chunk_size: Maximum number of section characters per chunk
            (the header prepended to "content" is not counted).
        chunk_overlap: Characters repeated between consecutive sub-chunks
            of an oversized section. Negative values are treated as 0;
            values >= max_chunk_size fall back to max_chunk_size // 2 so
            the window always advances.

    Returns:
        A list of dicts with keys "id", "content" (header, blank line,
        text), "section" and "document_id". Ids are
        "<document_id>_s<section index>" or, for split sections,
        "<document_id>_s<section index>_p<part index>".

    Raises:
        ValueError: If max_chunk_size is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive number of characters")

    overlap = max(chunk_overlap, 0)
    if overlap >= max_chunk_size:
        # A non-positive step would drop content or never advance.
        overlap = max_chunk_size // 2
    step = max_chunk_size - overlap

    # Detect section headers (common clinical document patterns).
    # These two patterns rely on letter case, so they must stay
    # case-sensitive: with IGNORECASE the first matches any short line of
    # words and the second matches ordinary numbered list items.
    CASE_SENSITIVE_PATTERNS = [
        r'^[A-Z][A-Z\s]{5,50}$',                    # ALL CAPS headers
        r'^\d+\.\s+[A-Z]',                          # "1. Introduction"
    ]
    # Keyword patterns are matched regardless of case ("SECTION 2", "Dosing").
    CASE_INSENSITIVE_PATTERNS = [
        r'^(SECTION|CHAPTER|PART)\s+\d+',           # Numbered sections
        r'^(Introduction|Background|Methods|Results|Discussion|Conclusion)',  # Academic
        r'^(Dosing|Contraindications|Warnings|Precautions|Drug Interactions)',  # Medical
    ]
    header_matchers = [re.compile(p) for p in CASE_SENSITIVE_PATTERNS]
    header_matchers += [re.compile(p, re.IGNORECASE) for p in CASE_INSENSITIVE_PATTERNS]

    sections = []
    current_section = {"header": title, "content": []}

    for line in pdf_text.split('\n'):
        stripped = line.strip()
        is_header = len(stripped) > 3 and any(m.match(stripped) for m in header_matchers)

        if is_header:
            # A header with no body before the next header is discarded.
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"header": stripped, "content": []}
        else:
            current_section["content"].append(line)

    if current_section["content"]:
        sections.append(current_section)

    # Convert sections to chunks
    chunks = []
    for section_idx, section in enumerate(sections):
        section_text = "\n".join(section["content"]).strip()
        if not section_text:
            continue

        if len(section_text) <= max_chunk_size:
            chunks.append({
                "id": f"{document_id}_s{section_idx}",
                "content": f"{section['header']}\n\n{section_text}",
                "section": section["header"],
                "document_id": document_id,
            })
            continue

        # Split large sections into overlapping sub-chunks. Stop as soon as
        # a window reaches the end of the section; otherwise the last
        # window would be entirely contained in the previous one.
        sub_chunks = []
        start = 0
        while True:
            sub_chunks.append(section_text[start:start + max_chunk_size])
            if start + max_chunk_size >= len(section_text):
                break
            start += step

        for sub_idx, sub_chunk in enumerate(sub_chunks):
            chunks.append({
                "id": f"{document_id}_s{section_idx}_p{sub_idx}",
                "content": f"{section['header']}\n\n{sub_chunk}",
                "section": section["header"],
                "document_id": document_id,
            })

    return chunks

Comparison: PDF Parsing Options

| Tool | Text PDFs | Tables | Scanned | Cost | Speed |
|---|---|---|---|---|---|
| pypdf | Good | Poor | No | Free | Fast |
| pdfplumber | Good | Good | No | Free | Fast |
| Docling | Excellent | Excellent | Yes (basic) | Free | Medium |
| LlamaParse | Excellent | Excellent | Yes | $3/1000 pages | Slow |
| Tesseract (OCR) | N/A | Poor | Yes | Free | Slow |
| Azure Document Intelligence | Excellent | Excellent | Yes | Pay-per-page | Medium |

Recommendation for clinical AI:

Standard drug information PDFs (Lexicomp, FDA labels): pdfplumber or Docling
Scanned legacy documents: Tesseract or LlamaParse
Complex research papers with tables and formulas: LlamaParse or Azure DI
High volume (thousands of PDFs): Docling (free, good accuracy, batch processing)

PDF Parsing for RAG

PDF Parsing Challenges

PyPDF2 and pypdf: Simple Text Extraction

pdfplumber: Better Layout Handling

Docling: Advanced PDF Understanding

LlamaParse: Cloud-Based AI Parsing

OCR for Scanned PDFs

Chunking PDFs by Section

Comparison: PDF Parsing Options

Enjoyed this article?

Leave a comment