LangChain Mastery · Lesson 25 of 33
Document Loaders: PDF, Web, CSV, and More
What is a Document?
A Document is LangChain's universal data container: text content plus metadata.
from langchain_core.documents import Document
# Build a Document by hand: page_content carries the text, metadata is a plain dict.
doc = Document(
page_content="Warfarin inhibits vitamin K epoxide reductase (VKORC1).",
# Metadata keys are free-form; the four below are illustrative.
metadata={
"source": "clinical_pharmacology.pdf",
"page": 42,
"author": "Smith et al.",
"date": "2026-01-15",
},
)
print(doc.page_content) # The text
print(doc.metadata) # Dict of any metadata you want to attach

Every loader returns a list[Document]. The page_content goes to the LLM; metadata is used for filtering and citation.
PDF Loader
from langchain_community.document_loaders import PyPDFLoader
# Single PDF — one Document per page
loader = PyPDFLoader("clinical_guidelines.pdf")
docs = loader.load()
print(f"Pages loaded: {len(docs)}")
# docs[0] is the first page; the preview is capped at 200 characters.
print(f"Page 1 preview: {docs[0].page_content[:200]}")
print(f"Page 1 metadata: {docs[0].metadata}")
# Example output: {'source': 'clinical_guidelines.pdf', 'page': 0}
# Note that the 'page' value shown for the first page is 0 (zero-based).
# Load multiple PDFs from a directory
from langchain_community.document_loaders import PyPDFDirectoryLoader
dir_loader = PyPDFDirectoryLoader("./drug_references/")
all_docs = dir_loader.load()
print(f"Total pages across all PDFs: {len(all_docs)}")
# Lazy loading — avoid loading everything into memory at once
# (the loop that follows consumes this loader one page at a time)
loader = PyPDFLoader("large_pharmacopeia.pdf")
for doc in loader.lazy_load():
    process_page(doc) # Process one page at a time

Metadata enrichment after loading:
def load_pdf_with_metadata(filepath: str, category: str) -> list[Document]:
    """Load a PDF page by page and enrich each page's metadata for filtering.

    Args:
        filepath: Path to the PDF file; passed unchanged to ``PyPDFLoader``.
        category: Label stored under the ``"category"`` metadata key.

    Returns:
        The loaded pages whose stripped text is longer than 50 characters.
        Every page (including ones that are later filtered out) is counted in
        ``total_pages``.
    """
    # Local import: the file only imports pathlib further down, after this
    # function has already been called.
    from pathlib import Path

    pages = PyPDFLoader(filepath).load()

    # ``stem`` drops only the final extension and understands the platform's
    # path separators, unlike ``split("/")`` + ``replace(".pdf", "")``, which
    # removes every ".pdf" occurrence and misses ".PDF".
    filename = Path(filepath).stem
    total_pages = len(pages)

    for page in pages:
        page.metadata.update({
            "filename": filename,
            "category": category,
            "total_pages": total_pages,
            "char_count": len(page.page_content),
        })

    # Filter out empty pages (common in scanned PDFs)
    return [page for page in pages if len(page.page_content.strip()) > 50]
guidelines = load_pdf_with_metadata("warfarin_guidelines.pdf", "anticoagulation")

Web Loader
from langchain_community.document_loaders import WebBaseLoader
import bs4
# Load a single web page
loader = WebBaseLoader("https://example.com/drug-monograph")
docs = loader.load()
# Returns one Document with the page's text content
# Load multiple URLs
# (this list is reused by the async example further down)
urls = [
"https://example.com/warfarin",
"https://example.com/metformin",
"https://example.com/lisinopril",
]
loader = WebBaseLoader(urls)
docs = loader.load()
print(f"Pages loaded: {len(docs)}")
# Parse only specific HTML elements (avoid loading nav/footer/ads)
# NOTE(review): the class_ tuple presumably keeps elements carrying any of
# these CSS classes — confirm against the BeautifulSoup SoupStrainer docs.
loader = WebBaseLoader(
web_paths=["https://example.com/clinical-trial"],
bs_kwargs={
"parse_only": bs4.SoupStrainer(
class_=("article-body", "main-content", "study-results")
)
},
)
docs = loader.load()
# Async loading for multiple pages (much faster)
import asyncio
async def load_pages_async(urls: list[str]) -> list[Document]:
    """Collect the Documents for ``urls`` through the loader's async iterator.

    Args:
        urls: Page addresses handed to ``WebBaseLoader``.

    Returns:
        Every Document produced by ``alazy_load()``, in the order yielded.
    """
    web_loader = WebBaseLoader(urls)
    web_loader.requests_per_second = 2  # Rate limit — be polite to servers
    return [page async for page in web_loader.alazy_load()]
docs = asyncio.run(load_pages_async(urls))

CSV and Structured Data Loaders
from langchain_community.document_loaders import CSVLoader
# CSV loader
# Each row becomes one Document
loader = CSVLoader(
file_path="drug_interactions.csv",
source_column="source", # Use this column as the metadata source
metadata_columns=["severity", "drug_class"], # Include these as metadata
csv_args={"delimiter": ","},
)
docs = loader.load()
# Illustrative result for one row:
# Each doc: page_content = "drug_a: warfarin\ndrug_b: aspirin\neffect: Major interaction..."
# metadata = {"source": "lexicomp", "severity": "Major", "drug_class": "anticoagulant"}
# JSON Loader
from langchain_community.document_loaders import JSONLoader
loader = JSONLoader(
file_path="clinical_trials.json",
jq_schema=".trials[].description", # jq expression to extract text
# NOTE(review): text_content=True presumably requires the extracted values
# to be strings — confirm against the JSONLoader documentation.
text_content=True,
)
docs = loader.load()
# Pandas DataFrame loader (when data is already in memory)
from langchain_community.document_loaders import DataFrameLoader
import pandas as pd
df = pd.read_csv("patient_medications.csv")
loader = DataFrameLoader(
data_frame=df,
page_content_column="medication_notes", # Column to use as page_content
)
docs = loader.load()
# metadata includes all other DataFrame columns automatically

Database Loader
from langchain_community.document_loaders import SQLDatabaseLoader
from langchain_community.utilities import SQLDatabase
# Connect to a database
db = SQLDatabase.from_uri("sqlite:///clinical_data.db")
# Load query results as Documents
loader = SQLDatabaseLoader(
query="SELECT drug_name, mechanism, standard_dose, indication FROM drug_formulary",
db=db,
# The mapper turns one result row into the text of that row's Document.
# NOTE(review): it indexes the row by column name — confirm the row type
# supplied by your installed version supports row['col'] access.
page_content_mapper=lambda row: (
f"Drug: {row['drug_name']}\n"
f"Mechanism: {row['mechanism']}\n"
f"Dose: {row['standard_dose']}\n"
f"Indication: {row['indication']}"
),
)
docs = loader.load()
# Each row → one Document with structured text as page_content
print(docs[0].page_content)
# Example output:
# Drug: Warfarin
# Mechanism: VKORC1 inhibitor
# Dose: 2-10mg daily INR-guided
# Indication: Atrial fibrillation, DVT/PE

Custom Loader
When built-in loaders don't fit, subclass BaseLoader:
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from typing import Iterator
import requests
class EHRLoader(BaseLoader):
    """Load patient medication records from an EHR API.

    One Document is produced per patient id. A patient whose record cannot be
    fetched or formatted is reported on stdout and skipped, so one bad record
    does not abort the rest of the batch.
    """

    def __init__(self, patient_ids: list[str], api_url: str, api_key: str):
        """Store the patient ids, the API base URL, and a bearer-token header.

        Args:
            patient_ids: Identifiers interpolated into the request path.
            api_url: Base URL of the EHR API (no trailing slash expected).
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
        """
        self.patient_ids = patient_ids
        self.api_url = api_url
        self.headers = {"Authorization": f"Bearer {api_key}"}

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per patient record, skipping failed patients."""
        for patient_id in self.patient_ids:
            try:
                document = self._fetch_record(patient_id)
            except Exception as e:
                # Log and skip — don't fail the entire batch
                print(f"Failed to load patient {patient_id}: {e}")
                continue
            # Yield outside the try block: an exception raised by the consumer
            # (e.g. via generator.throw) must not be swallowed and reported
            # as a load failure for this patient.
            yield document

    def _fetch_record(self, patient_id: str) -> Document:
        """Fetch one patient's medications and format them as a Document.

        Any request, HTTP-status, JSON-decoding, or formatting error
        propagates to the caller.
        """
        response = requests.get(
            f"{self.api_url}/patients/{patient_id}/medications",
            headers=self.headers,
            timeout=5,
        )
        response.raise_for_status()
        data = response.json()
        medications = data.get("medications", [])
        # Format medications as readable text
        med_text = "\n".join(
            f"- {m['name']} {m['dose']} ({m['frequency']})"
            for m in medications
        )
        return Document(
            page_content=f"Patient {patient_id} medications:\n{med_text}",
            metadata={
                "patient_id": patient_id,
                "source": "ehr_api",
                "record_date": data.get("date"),
                "medication_count": len(medications),
            },
        )

    def load(self) -> list[Document]:
        """Return every Document from ``lazy_load()`` as a list."""
        return list(self.lazy_load())
# Use it just like any built-in loader
# NOTE(review): the api_key below is a hard-coded placeholder; in real code
# read the credential from the environment or a secret store instead.
ehr_loader = EHRLoader(
patient_ids=["P001", "P002", "P003"],
api_url="https://ehr.hospital.internal",
api_key="secret",
)
docs = ehr_loader.load()

Error-Resilient Ingestion Pipeline
from pathlib import Path
from dataclasses import dataclass, field
@dataclass
class IngestionResult:
    """Outcome of an ingestion run: loaded documents plus failed and skipped entries."""

    # Documents that were loaded and kept.
    loaded: list[Document] = field(default_factory=list)
    # One text entry per file that raised while loading.
    failed: list[str] = field(default_factory=list)
    # One text entry per file that was not attempted.
    skipped: list[str] = field(default_factory=list)

    def summary(self) -> str:
        """Return a one-line count of loaded, failed, and skipped items."""
        counts = (
            f"Loaded: {len(self.loaded)} documents",
            f"Failed: {len(self.failed)} files",
            f"Skipped: {len(self.skipped)} files",
        )
        return " | ".join(counts)
def ingest_directory(directory: str, min_chars: int = 100) -> IngestionResult:
    """Load all supported files from a directory with error isolation.

    Args:
        directory: Folder whose direct children are examined (no recursion).
        min_chars: Minimum stripped length a Document's text must have to be
            kept.

    Returns:
        An ``IngestionResult`` where ``loaded`` holds the kept Documents,
        ``skipped`` the paths with an unsupported suffix, and ``failed`` one
        ``"<file name>: <error>"`` entry per file that raised while loading.
    """
    # Local import so the function does not depend on a file-level import.
    from datetime import date

    result = IngestionResult()
    loader_factories = {
        ".pdf": PyPDFLoader,
        ".csv": lambda f: CSVLoader(file_path=f),
    }
    # Stamp documents with the actual run date instead of a fixed literal.
    ingested_at = date.today().isoformat()

    # iterdir() yields entries in arbitrary order; sort for reproducible runs.
    for file in sorted(Path(directory).iterdir()):
        suffix = file.suffix.lower()
        if suffix not in loader_factories:
            result.skipped.append(str(file))
            continue

        try:
            docs = loader_factories[suffix](str(file)).load()
            # Stat once per file rather than once per document.
            file_size = file.stat().st_size
        except Exception as e:
            # Isolation boundary: one unreadable file must not stop the run.
            result.failed.append(f"{file.name}: {e}")
            continue

        for doc in docs:
            # Quality filter
            if len(doc.page_content.strip()) < min_chars:
                continue
            # Add common metadata
            doc.metadata["ingested_at"] = ingested_at
            doc.metadata["file_size_bytes"] = file_size
            result.loaded.append(doc)

    return result
# Run the pipeline over a reference folder and print the one-line summary.
result = ingest_directory("./clinical_references/")
print(result.summary())
# Loaded: 847 documents | Failed: 2 files | Skipped: 5 files

Loader Comparison
| Loader | Source | Key Option | Returns |
|---|---|---|---|
| PyPDFLoader | Local PDF | extract_images=True | One doc per page |
| PyPDFDirectoryLoader | PDF folder | glob="*.pdf" | All PDFs in dir |
| WebBaseLoader | URL(s) | bs_kwargs for CSS filtering | One doc per URL |
| CSVLoader | CSV file | source_column, metadata_columns | One doc per row |
| JSONLoader | JSON file | jq_schema | Extracted text nodes |
| DataFrameLoader | Pandas DataFrame | page_content_column | One doc per row |
| SQLDatabaseLoader | SQL DB | query, page_content_mapper | One doc per row |
| Custom BaseLoader | Anything | Implement lazy_load() | Anything you define |