PDF Parsing for RAG
Extract clean, structured text from PDFs for RAG ingestion. Handle tables, multi-column layouts, headers, footers, and scanned documents with OCR.
PDF Parsing Challenges
PDFs are not just text — they're a page layout format. Extracting clean text requires handling:
- Multi-column layouts: Text flows down one column and then continues in the next, so extraction that reads straight across the page interleaves unrelated lines
- Tables: Tabular data needs structure preservation, not just extracted text
- Headers and footers: Page numbers, document titles that should be removed
- Embedded images: Text in images requires OCR
- Scanned documents: Entire pages are images — all text requires OCR
- Mathematical formulas: LaTeX or rendered math that doesn't extract cleanly
PyPDF2 and pypdf: Simple Text Extraction
import pypdf
from pathlib import Path
def extract_text_pypdf(pdf_path: str) -> dict:
    """Basic text extraction from a PDF. Works for simple text PDFs.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        A dict with three keys:
        - "metadata": ``n_pages``, ``title``, ``author`` and ``subject``.
          ``title`` falls back to the file stem when the PDF has none.
        - "full_text": every non-empty page, prefixed with a 1-based
          ``[Page N]`` marker and joined by blank lines.
        - "pages": whitespace-normalized text keyed by the 0-based page
          index. Pages that yield no text are absent.
    """
    text_by_page = {}
    full_text = []
    with open(pdf_path, "rb") as f:
        reader = pypdf.PdfReader(f)
        # reader.metadata is None when the PDF carries no document
        # information, so fall back to an empty mapping instead of raising
        # AttributeError on .get().
        info = reader.metadata or {}
        metadata = {
            "n_pages": len(reader.pages),
            # `or` also covers entries that are present but empty/None.
            "title": info.get("/Title") or Path(pdf_path).stem,
            "author": info.get("/Author") or "",
            "subject": info.get("/Subject") or "",
        }
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text and text.strip():
                # Basic cleanup: collapse every whitespace run (including
                # newlines) into a single space.
                text = " ".join(text.split())
                text_by_page[page_num] = text
                full_text.append(f"[Page {page_num + 1}]\n{text}")
    return {
        "metadata": metadata,
        "full_text": "\n\n".join(full_text),
        "pages": text_by_page,
    }
def clean_extracted_text(text: str) -> str:
    """Clean common PDF extraction artifacts.

    Applies, in order: removal of lines that contain only a number (page
    numbers), collapsing of long runs of horizontal whitespace, re-joining
    of words hyphenated across a line break, and squeezing of three or more
    consecutive newlines down to one blank line.

    Args:
        text: Raw text as returned by a PDF/OCR extractor.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    import re

    # Remove page numbers (standalone numbers on their own line). Only
    # horizontal whitespace may surround the digits and the closing newline
    # is a lookahead, so neighbouring blank lines (paragraph breaks) survive
    # and back-to-back number lines are all matched.
    text = re.sub(r'\n[^\S\n]*\d+[^\S\n]*(?=\n)', '', text)
    # Remove excessive whitespace. `[^\S\n]` is "any whitespace except a
    # newline": matching `\s` here would flatten paragraph breaks into a
    # single space and leave the newline normalization below with nothing
    # to do.
    text = re.sub(r'[^\S\n]{3,}', ' ', text)
    # Fix hyphenated line breaks (common in justified PDFs)
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    # Normalize line endings: at most one blank line between paragraphs
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


# pdfplumber: Better Layout Handling
import pdfplumber
import pandas as pd
from typing import Optional
def _outside_tables(obj: dict, table_bboxes: list) -> bool:
    """Return True when a page object's midpoint lies outside every table box."""
    h_mid = (obj["x0"] + obj["x1"]) / 2
    v_mid = (obj["top"] + obj["bottom"]) / 2
    return not any(
        x0 <= h_mid < x1 and top <= v_mid < bottom
        for x0, top, x1, bottom in table_bboxes
    )


def extract_with_pdfplumber(
    pdf_path: str,
    extract_tables: bool = True,
) -> dict:
    """
    Extract text and tables from PDF using pdfplumber.
    Better than pypdf for complex layouts.

    Args:
        pdf_path: Path to the PDF file on disk.
        extract_tables: When True, each detected table is rendered as a
            markdown table (first row used as the header) and its region is
            left out of the page text so the cells are not emitted twice.

    Returns:
        A dict with "metadata" (``n_pages`` plus the raw pdfplumber
        metadata) and "pages": one dict per page holding the 1-based
        ``page`` number, the cleaned ``text`` and a list of markdown
        ``tables``.
    """
    pages_content = []
    with pdfplumber.open(pdf_path) as pdf:
        metadata = {
            "n_pages": len(pdf.pages),
            "metadata": pdf.metadata,
        }
        for page_num, page in enumerate(pdf.pages):
            page_content = {"page": page_num + 1, "text": "", "tables": []}
            text_source = page
            if extract_tables:
                # Extract tables first, remembering where each one sits.
                table_bboxes = []
                for found in page.find_tables():
                    table = found.extract()
                    if table:
                        df = pd.DataFrame(table[1:], columns=table[0])
                        page_content["tables"].append(df.to_markdown(index=False))
                        table_bboxes.append(found.bbox)
                if table_bboxes:
                    # Drop every page object that falls inside a table so the
                    # running text really excludes the table regions.
                    text_source = page.filter(
                        lambda obj, boxes=table_bboxes: _outside_tables(obj, boxes)
                    )
            # Extract text, excluding table regions
            text = text_source.extract_text(x_tolerance=3, y_tolerance=3)
            if text:
                page_content["text"] = clean_extracted_text(text)
            pages_content.append(page_content)
    return {"metadata": metadata, "pages": pages_content}
def format_pdf_for_rag(pages_content: list[dict]) -> str:
    """
    Flatten per-page extraction results into one string for RAG ingestion.

    Each page contributes its "text" followed by every entry of its
    "tables" list (already markdown), each table wrapped in blank lines.
    Pages whose combined text is only whitespace are skipped; the remaining
    pages are joined with a blank line.
    """
    rendered_pages = []
    for entry in pages_content:
        # Tables are appended after the page text, in their stored order.
        table_blocks = [f"\n\n{table_md}\n" for table_md in (entry["tables"] or [])]
        combined = entry["text"] + "".join(table_blocks)
        if not combined.strip():
            continue
        rendered_pages.append(combined)
    return "\n\n".join(rendered_pages)


# Docling: Advanced PDF Understanding
Docling (IBM) provides state-of-the-art PDF parsing with layout understanding:
# pip install docling
from docling.document_converter import DocumentConverter
def extract_with_docling(pdf_path: str) -> dict:
    """
    Convert a PDF with Docling and return its markdown plus a flat item list.

    Returns a dict with:
    - "markdown": the document exported via ``export_to_markdown()``.
    - "elements": one ``{"type", "text"}`` dict per item yielded by
      ``iterate_items()``; "type" is the item's class name and "text" is its
      ``text`` attribute when it has one, otherwise ``str(item)``.
    - "n_pages": the value of ``num_pages()`` on the converted document.
    """
    document = DocumentConverter().convert(pdf_path).document
    # Docling returns structured markdown
    markdown_text = document.export_to_markdown()
    # iterate_items() yields pairs; only the first member (the item) is used.
    elements = [
        {
            "type": type(item).__name__,
            "text": item.text if hasattr(item, "text") else str(item),
        }
        for item, _ in document.iterate_items()
    ]
    return {
        "markdown": markdown_text,
        "elements": elements,
        "n_pages": document.num_pages(),
    }
# Docling advantages:
# - Layout-aware: correctly handles multi-column, figures, tables
# - Table extraction: converts tables to clean markdown
# - Formula recognition: detects math formulas
# - Reading order: correct top-to-bottom reading order for multi-column layouts
LlamaParse: Cloud-Based AI Parsing
LlamaIndex's cloud parsing service uses an LLM to parse complex PDFs:
# pip install llama-parse
from llama_parse import LlamaParse
def parse_with_llamaparse(
    pdf_path: str,
    api_key: str,
    result_type: str = "markdown",
    language: str = "en",
    parsing_instruction: Optional[str] = None,
) -> str:
    """
    Parse PDF using LlamaParse (cloud, AI-powered).
    Best for complex medical/scientific/legal PDFs.

    Args:
        pdf_path: Path to the PDF file to upload for parsing.
        api_key: LlamaParse API key.
        result_type: "markdown" or "text".
        language: Language code passed to the parser.
        parsing_instruction: Free-text guidance for the parser. When None,
            the clinical-pharmacology instruction this function has always
            used is applied, so existing callers are unaffected.

    Returns:
        The text of all returned documents joined by blank lines.
    """
    if parsing_instruction is None:
        # Default kept for backward compatibility; pass an explicit
        # instruction for non-clinical documents.
        parsing_instruction = (
            "This is a clinical pharmacology document.\n"
            "Preserve table structure, drug names, dosing information, "
            "and interaction severity ratings.\n"
            "Format tables as markdown tables."
        )
    parser = LlamaParse(
        api_key=api_key,
        result_type=result_type,  # "markdown" or "text"
        language=language,
        verbose=True,
        parsing_instruction=parsing_instruction,
    )
    # load_data returns a list of Document objects
    documents = parser.load_data(pdf_path)
    return "\n\n".join(doc.text for doc in documents)
# LlamaParse advantages:
# - Best-in-class accuracy for complex PDFs
# - Handles scanned PDFs (built-in OCR)
# - Customizable parsing instructions
# - Preserves table structure
# Cost: $3/1000 pages (as of 2025)
OCR for Scanned PDFs
# pip install pytesseract pdf2image pillow
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import io
def extract_with_ocr(
    pdf_path: str,
    dpi: int = 300,
    language: str = "eng",
) -> str:
    """
    Run Tesseract OCR over every page of a (scanned) PDF.

    The PDF is rasterized at ``dpi`` and each page image is OCR'd with the
    given Tesseract ``language``. Pages that produce only whitespace are
    skipped; the rest are cleaned, prefixed with a 1-based ``[Page N]``
    marker and joined by blank lines.
    """
    # Rasterize the whole document up front: one image per page.
    page_images = convert_from_path(pdf_path, dpi=dpi)
    recognized = []
    for page_num, image in enumerate(page_images):
        # --psm 6: assume a single uniform block of text.
        # --oem 3: Tesseract's default engine mode (uses whatever is available).
        text = pytesseract.image_to_string(
            image,
            lang=language,
            config='--psm 6 --oem 3',
        )
        if not text.strip():
            continue
        recognized.append(f"[Page {page_num + 1}]\n{clean_extracted_text(text)}")
    return "\n\n".join(recognized)
def smart_pdf_loader(pdf_path: str, min_chars_per_page: int = 100) -> str:
    """
    Automatically detect if PDF is text-based or scanned,
    and use the appropriate extraction method.

    Args:
        pdf_path: Path to the PDF file on disk.
        min_chars_per_page: Average number of extracted characters per page
            below which the PDF is treated as scanned and sent to OCR.

    Returns:
        The page-marked text from standard extraction, or the OCR output
        when the PDF appears to be scanned.
    """
    # Try standard extraction first
    basic_result = extract_text_pypdf(pdf_path)
    n_pages = max(basic_result["metadata"]["n_pages"], 1)  # avoid division by zero
    # Count only characters that came out of the PDF. "full_text" also holds
    # the "[Page N]" markers and separators, which would inflate the average
    # and could push a nearly empty scan over the threshold.
    extracted_chars = sum(len(page_text) for page_text in basic_result["pages"].values())
    avg_chars_per_page = extracted_chars / n_pages
    # If very little text was extracted, it's likely scanned
    if avg_chars_per_page < min_chars_per_page:
        print(f"PDF appears scanned ({avg_chars_per_page:.0f} chars/page). Using OCR.")
        return extract_with_ocr(pdf_path)
    print(f"PDF is text-based ({avg_chars_per_page:.0f} chars/page). Using standard extraction.")
    return basic_result["full_text"]


# Chunking PDFs by Section
Preserve document structure during chunking:
import re
def _split_with_overlap(text: str, size: int, overlap: int) -> list[str]:
    """Cut text into windows of `size` characters that share `overlap` characters."""
    step = size - overlap
    pieces = []
    for start in range(0, len(text), step):
        pieces.append(text[start:start + size])
        if start + size >= len(text):
            # This window already reaches the end; a further window would
            # only repeat characters that are in this one.
            break
    return pieces


def chunk_pdf_by_sections(
    pdf_text: str,
    document_id: str,
    title: str,
    max_chunk_size: int = 1000,  # characters
    chunk_overlap: int = 100,  # characters shared by consecutive sub-chunks
) -> list[dict]:
    """
    Chunk a PDF by detected section headers.
    Falls back to fixed-size chunking when sections are too large.

    Args:
        pdf_text: Extracted document text, one logical line per line.
        document_id: Identifier used as the prefix of every chunk id.
        title: Header used for text that precedes the first detected header.
        max_chunk_size: Maximum number of section characters per chunk
            (the header prepended to "content" is not counted).
        chunk_overlap: Characters repeated between consecutive sub-chunks of
            an oversized section. Capped at half of ``max_chunk_size`` so the
            window always advances.

    Returns:
        A list of dicts with "id", "content" (header, blank line, text),
        "section" and "document_id". Ids are ``{document_id}_s{section}``
        or ``{document_id}_s{section}_p{part}`` for split sections.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive or
            ``chunk_overlap`` is negative.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive number of characters")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must not be negative")
    overlap = min(chunk_overlap, max_chunk_size // 2)

    # Detect section headers (common clinical document patterns). The first
    # two patterns depend on letter case and must stay case-sensitive: with
    # IGNORECASE the ALL-CAPS rule would accept any short letters-only line
    # and turn ordinary sentences into headers. Only the keyword patterns
    # are matched case-insensitively.
    header_patterns = [
        re.compile(r'^[A-Z][A-Z\s]{5,50}$'),  # ALL CAPS headers
        re.compile(r'^\d+\.\s+[A-Z]'),  # "1. Introduction"
        re.compile(r'^(SECTION|CHAPTER|PART)\s+\d+', re.IGNORECASE),  # Numbered sections
        re.compile(r'^(Introduction|Background|Methods|Results|Discussion|Conclusion)', re.IGNORECASE),  # Academic
        re.compile(r'^(Dosing|Contraindications|Warnings|Precautions|Drug Interactions)', re.IGNORECASE),  # Medical
    ]

    sections = []
    current_section = {"header": title, "content": []}
    for line in pdf_text.split('\n'):
        stripped = line.strip()
        is_header = len(stripped) > 3 and any(
            pattern.match(stripped) for pattern in header_patterns
        )
        if is_header:
            # A section with no collected lines (e.g. a header directly
            # followed by another header) is dropped.
            if current_section["content"]:
                sections.append(current_section)
            current_section = {"header": stripped, "content": []}
        else:
            current_section["content"].append(line)
    if current_section["content"]:
        sections.append(current_section)

    # Convert sections to chunks
    chunks = []
    for section_idx, section in enumerate(sections):
        section_text = "\n".join(section["content"]).strip()
        if not section_text:
            continue
        if len(section_text) > max_chunk_size:
            # Split large sections into overlapping sub-chunks
            sub_chunks = _split_with_overlap(section_text, max_chunk_size, overlap)
            for sub_idx, sub_chunk in enumerate(sub_chunks):
                chunks.append({
                    "id": f"{document_id}_s{section_idx}_p{sub_idx}",
                    "content": f"{section['header']}\n\n{sub_chunk}",
                    "section": section["header"],
                    "document_id": document_id,
                })
        else:
            chunks.append({
                "id": f"{document_id}_s{section_idx}",
                "content": f"{section['header']}\n\n{section_text}",
                "section": section["header"],
                "document_id": document_id,
            })
    return chunks


# Comparison: PDF Parsing Options
| Tool | Text PDFs | Tables | Scanned | Cost | Speed |
|---|---|---|---|---|---|
| pypdf | Good | Poor | No | Free | Fast |
| pdfplumber | Good | Good | No | Free | Fast |
| Docling | Excellent | Excellent | Yes (basic) | Free | Medium |
| LlamaParse | Excellent | Excellent | Yes | $3/1000 pages | Slow |
| Tesseract (OCR) | N/A | Poor | Yes | Free | Slow |
| Azure Document Intelligence | Excellent | Excellent | Yes | Pay-per-page | Medium |
Recommendation for clinical AI:
- Standard drug information PDFs (Lexicomp, FDA labels): pdfplumber or Docling
- Scanned legacy documents: Tesseract or LlamaParse
- Complex research papers with tables and formulas: LlamaParse or Azure DI
- High volume (thousands of PDFs): Docling (free, good accuracy, batch processing)
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.