LangChain Mastery · Lesson 25 of 33

Document Loaders: PDF, Web, CSV, and More

What is a Document?

A Document is LangChain's universal data container: text content plus metadata.

Python

from langchain_core.documents import Document

# Build a Document by hand: the text goes in page_content and every other
# piece of information about it goes in the metadata dict.
doc = Document(
    page_content="Warfarin inhibits vitamin K epoxide reductase (VKORC1).",
    metadata={
        "source": "clinical_pharmacology.pdf",
        "page": 42,
        "author": "Smith et al.",
        "date": "2026-01-15",
    },
)

print(doc.page_content)    # The text passed as page_content above
print(doc.metadata)        # The metadata dict exactly as supplied above

Every loader's load() method returns a list[Document]; lazy_load() yields the same Documents one at a time. The page_content goes to the LLM; metadata is used for filtering and citation.

PDF Loader

Python

from langchain_community.document_loaders import PyPDFLoader

# Single PDF — one Document per page
loader = PyPDFLoader("clinical_guidelines.pdf")
docs = loader.load()

# Inspect what came back: page count, the first 200 characters of the first
# page, and the metadata attached to it.
print(f"Pages loaded: {len(docs)}")
print(f"Page 1 preview: {docs[0].page_content[:200]}")
print(f"Page 1 metadata: {docs[0].metadata}")
# Example output — note that the first page is reported as 'page': 0:
# {'source': 'clinical_guidelines.pdf', 'page': 0}


# Load multiple PDFs from a directory
from langchain_community.document_loaders import PyPDFDirectoryLoader

dir_loader = PyPDFDirectoryLoader("./drug_references/")
all_docs = dir_loader.load()
# len(all_docs) counts Documents (pages), not files.
print(f"Total pages across all PDFs: {len(all_docs)}")


# Lazy loading — avoid loading everything into memory at once
loader = PyPDFLoader("large_pharmacopeia.pdf")
# NOTE: process_page is a placeholder for your own per-page handler; it is not
# defined in this snippet.
for doc in loader.lazy_load():
    process_page(doc)   # Process one page at a time

Metadata enrichment after loading:

Python

def load_pdf_with_metadata(
    filepath: str, category: str, min_chars: int = 50
) -> list[Document]:
    """Load a PDF page by page and enrich each page's metadata for filtering.

    Args:
        filepath: Path to the PDF file to load.
        category: Label stored under the ``"category"`` metadata key of
            every page.
        min_chars: A page is kept only when its whitespace-stripped text is
            longer than this many characters. Defaults to 50, which drops
            the blank or near-blank pages common in scanned PDFs.

    Returns:
        The pages whose stripped text exceeds ``min_chars``. Each carries the
        added ``"filename"``, ``"category"``, ``"total_pages"`` and
        ``"char_count"`` metadata keys. ``"total_pages"`` counts every page
        the loader returned, including pages that were filtered out.
    """
    from pathlib import Path  # local import keeps this snippet self-contained

    loader = PyPDFLoader(filepath)
    docs = loader.load()

    # Path.stem strips the directory part and only the final extension, so a
    # name such as "a.pdf.notes.pdf" is not mangled and the platform's own
    # path separator is honoured.
    filename = Path(filepath).stem
    total_pages = len(docs)  # loop-invariant, so computed once

    for doc in docs:
        doc.metadata.update({
            "filename": filename,
            "category": category,
            "total_pages": total_pages,
            "char_count": len(doc.page_content),
        })

    # Filter out empty pages (common in scanned PDFs)
    return [d for d in docs if len(d.page_content.strip()) > min_chars]


# Load the warfarin guidelines PDF, tagging every page with the
# "anticoagulation" category.
guidelines = load_pdf_with_metadata("warfarin_guidelines.pdf", "anticoagulation")

Web Loader

Python

from langchain_community.document_loaders import WebBaseLoader
import bs4

# Load a single web page
loader = WebBaseLoader("https://example.com/drug-monograph")
docs = loader.load()
# Returns one Document with the page's text content


# Load multiple URLs
# The same loader class accepts a list of URLs in place of a single string.
urls = [
    "https://example.com/warfarin",
    "https://example.com/metformin",
    "https://example.com/lisinopril",
]
loader = WebBaseLoader(urls)
docs = loader.load()
print(f"Pages loaded: {len(docs)}")


# Parse only specific HTML elements (avoid loading nav/footer/ads)
# bs_kwargs is handed to BeautifulSoup; the SoupStrainer restricts parsing to
# elements carrying one of the listed CSS classes.
# NOTE(review): the class names are illustrative — use the ones your target
# site actually has.
loader = WebBaseLoader(
    web_paths=["https://example.com/clinical-trial"],
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("article-body", "main-content", "study-results")
        )
    },
)
docs = loader.load()


# Async loading for multiple pages (much faster)
import asyncio

async def load_pages_async(urls: list[str]) -> list[Document]:
    """Fetch every URL through one WebBaseLoader and collect the Documents.

    The loader is throttled to two requests per second so the target servers
    are not hammered.
    """
    web_loader = WebBaseLoader(urls)
    web_loader.requests_per_second = 2   # Rate limit — be polite to servers
    # Drain the async iterator into a list in a single expression.
    return [page async for page in web_loader.alazy_load()]

# Drive the coroutine to completion from synchronous code, reusing the `urls`
# list defined earlier. asyncio.run() cannot be called while another event
# loop is already running (for example inside a notebook cell); there,
# `await load_pages_async(urls)` directly instead.
docs = asyncio.run(load_pages_async(urls))

CSV and Structured Data Loaders

Python

from langchain_community.document_loaders import CSVLoader

# Each row becomes one Document
loader = CSVLoader(
    file_path="drug_interactions.csv",
    source_column="source",       # Use this column as the metadata source
    metadata_columns=["severity", "drug_class"],  # Include these as metadata
    csv_args={"delimiter": ","},
)
docs = loader.load()

# Example of one resulting Document (values are illustrative):
# Each doc: page_content = "drug_a: warfarin\ndrug_b: aspirin\neffect: Major interaction..."
#           metadata = {"source": "lexicomp", "severity": "Major", "drug_class": "anticoagulant"}


# JSON Loader
from langchain_community.document_loaders import JSONLoader

# The jq expression below walks the top-level "trials" array and picks each
# element's "description" field.
loader = JSONLoader(
    file_path="clinical_trials.json",
    jq_schema=".trials[].description",  # jq expression to extract text
    text_content=True,
)
docs = loader.load()


# Pandas DataFrame loader (when data is already in memory)
from langchain_community.document_loaders import DataFrameLoader
import pandas as pd

df = pd.read_csv("patient_medications.csv")

loader = DataFrameLoader(
    data_frame=df,
    page_content_column="medication_notes",  # Column to use as page_content
)
docs = loader.load()
# metadata includes all other DataFrame columns automatically

Database Loader

Python

from langchain_community.document_loaders import SQLDatabaseLoader
from langchain_community.utilities import SQLDatabase

# Connect to a database
db = SQLDatabase.from_uri("sqlite:///clinical_data.db")

# Load query results as Documents
# page_content_mapper receives one result row and returns the text for that
# row's Document; here it reads the four selected columns by name.
loader = SQLDatabaseLoader(
    query="SELECT drug_name, mechanism, standard_dose, indication FROM drug_formulary",
    db=db,
    page_content_mapper=lambda row: (
        f"Drug: {row['drug_name']}\n"
        f"Mechanism: {row['mechanism']}\n"
        f"Dose: {row['standard_dose']}\n"
        f"Indication: {row['indication']}"
    ),
)
docs = loader.load()

# Each row → one Document with structured text as page_content
print(docs[0].page_content)
# Example output (depends on the table's contents):
# Drug: Warfarin
# Mechanism: VKORC1 inhibitor
# Dose: 2-10mg daily INR-guided
# Indication: Atrial fibrillation, DVT/PE

Custom Loader

When built-in loaders don't fit, subclass BaseLoader:

Python

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from typing import Iterator
import requests

class EHRLoader(BaseLoader):
    """Load patient medication records from an EHR API.

    One HTTP GET is issued per patient ID against
    ``{api_url}/patients/{patient_id}/medications``. Each successful response
    becomes one Document. A patient whose record cannot be fetched or parsed
    is reported on stdout and skipped, so one bad record never aborts the
    whole batch.
    """

    def __init__(
        self,
        patient_ids: list[str],
        api_url: str,
        api_key: str,
        timeout: float = 5,
    ):
        """Store the connection settings; no request is made here.

        Args:
            patient_ids: IDs of the patients whose medications are loaded.
            api_url: Base URL of the EHR API. A trailing slash is removed so
                the request path never contains a double slash.
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            timeout: Per-request timeout in seconds passed to
                ``requests.get``. Defaults to 5.
        """
        self.patient_ids = patient_ids
        self.api_url = api_url.rstrip("/")
        self.headers = {"Authorization": f"Bearer {api_key}"}
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per patient record that loads successfully."""
        from urllib.parse import quote  # local import: only needed here

        for patient_id in self.patient_ids:
            try:
                # Percent-encode the ID so characters such as "/" or "?"
                # cannot alter the request path.
                encoded_id = quote(str(patient_id), safe="")
                response = requests.get(
                    f"{self.api_url}/patients/{encoded_id}/medications",
                    headers=self.headers,
                    timeout=self.timeout,
                )
                response.raise_for_status()
                data = response.json()

                # Look the list up once; it feeds both the text and the count.
                medications = data.get("medications", [])

                # Format medications as readable text
                med_text = "\n".join(
                    f"- {m['name']} {m['dose']} ({m['frequency']})"
                    for m in medications
                )

                document = Document(
                    page_content=f"Patient {patient_id} medications:\n{med_text}",
                    metadata={
                        "patient_id": patient_id,
                        "source": "ehr_api",
                        "record_date": data.get("date"),
                        "medication_count": len(medications),
                    },
                )
            except Exception as e:
                # Deliberately broad: log and skip — don't fail the entire
                # batch because one record is unreachable or malformed.
                print(f"Failed to load patient {patient_id}: {e}")
                continue

            # Yield outside the try block so an exception thrown into the
            # generator at this point is not misreported as a load failure.
            yield document

    def load(self) -> list[Document]:
        """Eagerly load every record and return the Documents as a list."""
        return list(self.lazy_load())


# Use it just like any built-in loader
# NOTE(review): "secret" is a placeholder. Read the real API key from
# configuration or an environment variable rather than hard-coding it.
ehr_loader = EHRLoader(
    patient_ids=["P001", "P002", "P003"],
    api_url="https://ehr.hospital.internal",
    api_key="secret",
)
# load() returns the Documents for the patients that loaded successfully.
docs = ehr_loader.load()

Error-Resilient Ingestion Pipeline

Python

from pathlib import Path
from dataclasses import dataclass, field

@dataclass
class IngestionResult:
    """Outcome of an ingestion run, split into three buckets."""

    # Documents that were loaded successfully.
    loaded: list[Document] = field(default_factory=list)
    # One human-readable entry per file that raised while loading.
    failed: list[str] = field(default_factory=list)
    # Paths of the entries that were not attempted.
    skipped: list[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return a one-line, pipe-separated count of each bucket."""
        n_loaded, n_failed, n_skipped = map(
            len, (self.loaded, self.failed, self.skipped)
        )
        return (
            f"Loaded: {n_loaded} documents | "
            f"Failed: {n_failed} files | "
            f"Skipped: {n_skipped} files"
        )


def ingest_directory(directory: str, min_chars: int = 100) -> IngestionResult:
    """Load all supported files from a directory with error isolation.

    Entries are visited in sorted order so repeated runs produce the same
    result order. Files ending in ``.pdf`` or ``.csv`` (case-insensitive) are
    loaded. Every other entry, including subdirectories, is recorded as
    skipped. A file that raises while loading is recorded as failed and does
    not stop the run.

    Args:
        directory: Path of the directory to scan (not recursive).
        min_chars: A Document is kept only when its whitespace-stripped text
            has at least this many characters.

    Returns:
        An IngestionResult holding the kept Documents plus the failed and
        skipped entries. Each kept Document gains an ``"ingested_at"``
        metadata key (today's UTC date, ISO ``YYYY-MM-DD``) and a
        ``"file_size_bytes"`` key.
    """
    from datetime import datetime, timezone

    result = IngestionResult()

    # Lower-case suffix -> callable that builds a loader from a path string.
    loader_factories = {
        ".pdf": PyPDFLoader,
        ".csv": lambda f: CSVLoader(file_path=f),
    }

    # Stamp every document of this run with the same, real ingestion date.
    ingested_at = datetime.now(timezone.utc).date().isoformat()

    for file in sorted(Path(directory).iterdir()):
        suffix = file.suffix.lower()
        # A directory named e.g. "archive.pdf" must not reach a loader.
        if suffix not in loader_factories or not file.is_file():
            result.skipped.append(str(file))
            continue

        try:
            docs = loader_factories[suffix](str(file)).load()
            # One stat() per file rather than one per document.
            file_size = file.stat().st_size

            # Quality filter
            good_docs = [
                d for d in docs
                if len(d.page_content.strip()) >= min_chars
            ]

            # Add common metadata
            for doc in good_docs:
                doc.metadata["ingested_at"] = ingested_at
                doc.metadata["file_size_bytes"] = file_size

            result.loaded.extend(good_docs)

        except Exception as e:
            # Deliberately broad: this is the per-file isolation boundary, so
            # the failure is recorded and the run moves on to the next file.
            result.failed.append(f"{file.name}: {e}")

    return result


# Ingest every supported file in the directory, using the default min_chars.
result = ingest_directory("./clinical_references/")
print(result.summary())
# Example output (actual counts depend on the directory's contents):
# Loaded: 847 documents | Failed: 2 files | Skipped: 5 files

Loader Comparison

| Loader | Source | Key Option | Returns |
|---|---|---|---|
| PyPDFLoader | Local PDF | extract_images=True | One doc per page |
| PyPDFDirectoryLoader | PDF folder | glob="*.pdf" | All PDFs in dir |
| WebBaseLoader | URL(s) | bs_kwargs for CSS filtering | One doc per URL |
| CSVLoader | CSV file | source_column, metadata_columns | One doc per row |
| JSONLoader | JSON file | jq_schema | Extracted text nodes |
| DataFrameLoader | Pandas DataFrame | page_content_column | One doc per row |
| SQLDatabaseLoader | SQL DB | query, page_content_mapper | One doc per row |
| Custom BaseLoader | Anything | Implement lazy_load() | Anything you define |

Interview: Build a Research Agent with LangChain

Next Lesson

Text Splitters: Chunk Your Documents Right