How to Check if Two PDF Documents are Identical in Python
Comparing PDF files requires different approaches depending on your definition of "identical." A binary comparison checks if files are exact byte-for-byte copies, while content comparison focuses on the actual text or visual appearance.
This guide covers three methods: hash-based verification for exact matches, and text extraction and visual comparison for content.
Binary Comparison (Exact Match)
For verifying backups or detecting file corruption, compare SHA-256 hashes. This method detects any difference, including metadata changes like timestamps:
import hashlib
import os
def get_file_hash(filepath, *, chunk_size=65536):
    """Return the SHA-256 hex digest of a file's contents.

    The file is opened in binary mode and read in fixed-size chunks so
    that large PDFs are hashed without loading them fully into memory.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be positive.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would end the loop at once
        # and silently yield the hash of an empty file.
        raise ValueError("chunk_size must be a positive number of bytes")

    hasher = hashlib.sha256()
    with open(filepath, "rb") as f:
        # An empty bytes object marks end of file and stops the loop.
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
def is_exact_match(pdf1, pdf2):
    """Return True when both files hold exactly the same bytes."""
    # Files of different length cannot match, so the size test lets the
    # short-circuiting `and` skip hashing entirely in that case.
    same_size = os.path.getsize(pdf1) == os.path.getsize(pdf2)
    return same_size and get_file_hash(pdf1) == get_file_hash(pdf2)
# Usage: report whether the backup is a byte-for-byte copy
print(
    "Files are identical"
    if is_exact_match("report_v1.pdf", "report_backup.pdf")
    else "Files differ"
)
Hash comparison is ideal for backup verification, detecting transmission errors, or confirming file integrity. It's fast and catches any modification, but will report files as different even if only metadata (like creation date) changed.
Text Content Comparison
When files have different metadata but you need to verify the actual content is the same, extract and compare text using PyMuPDF:
pip install pymupdf
import fitz # PyMuPDF
def extract_text(filepath):
    """Extract all text from a PDF document.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        The text of every page, concatenated in page order with no
        separator added between pages.
    """
    # The context manager closes the document even if extracting a page
    # raises, which a trailing doc.close() call would not guarantee.
    with fitz.open(filepath) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)
def is_content_match(pdf1, pdf2):
    """Return True when both PDFs yield the same text, ignoring metadata."""
    # Splitting on whitespace discards differences in spacing and line
    # breaks; comparing the token lists is equivalent to comparing the
    # tokens re-joined with single spaces.
    words1 = extract_text(pdf1).split()
    words2 = extract_text(pdf2).split()
    return words1 == words2
# Usage: check two contract revisions for textual changes
print(
    "Documents contain the same text"
    if is_content_match("contract_v1.pdf", "contract_v2.pdf")
    else "Text content differs"
)
Detailed Text Comparison with Diff
To see exactly what changed between documents:
import fitz
import difflib
def compare_text_detailed(pdf1, pdf2):
    """Return the unified diff between the text of two PDFs as a list of lines.

    The list is empty when the extracted text is the same line for line.
    """
    old_lines = extract_text(pdf1).splitlines()
    new_lines = extract_text(pdf2).splitlines()
    # splitlines() drops line endings, so lineterm="" keeps the diff's
    # header lines free of trailing newlines as well.
    delta = difflib.unified_diff(
        old_lines, new_lines, fromfile=pdf1, tofile=pdf2, lineterm=""
    )
    return [*delta]
# Usage
differences = compare_text_detailed("doc_old.pdf", "doc_new.pdf")
if not differences:
    print("No text differences")
else:
    print("Differences found:")
    # Cap the output at the first 20 diff lines
    for line in differences[:20]:
        print(line)
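Text extraction ignores images, formatting, fonts, and layout. Two PDFs with identical text but different visual appearance will be reported as matching. Likewise, scanned or image-only PDFs contain no extractable text, so any two of them will compare as equal.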
Visual Comparison (Page Images)
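For layout verification, convert pages to images and compare pixels. This catches formatting changes that text comparison misses. Note that pdf2image is a wrapper around Poppler, so the Poppler utilities must be installed on your system in addition to the Python packages: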
pip install pdf2image pillow
from pdf2image import convert_from_path
from PIL import ImageChops
import os
def compare_visual(pdf1, pdf2, dpi=150):
    """Render both PDFs to images and compare them page by page.

    Returns a ``(match, message)`` tuple: ``match`` is True only when the
    page counts, page dimensions, and every pixel agree; ``message``
    describes the first mismatch found, or confirms the match.
    """
    pages1 = convert_from_path(pdf1, dpi=dpi)
    pages2 = convert_from_path(pdf2, dpi=dpi)

    count1, count2 = len(pages1), len(pages2)
    if count1 != count2:
        return False, f"Page count differs: {count1} vs {count2}"

    for page_no, (left, right) in enumerate(zip(pages1, pages2), start=1):
        if left.size != right.size:
            return False, f"Page {page_no} dimensions differ"
        # getbbox() is None only when the difference image is entirely zero.
        if ImageChops.difference(left, right).getbbox() is not None:
            return False, f"Page {page_no} has visual differences"

    return True, "Documents are visually identical"
# Usage
# compare_visual returns a (match, message) pair; only the message is shown here.
match, message = compare_visual("layout_v1.pdf", "layout_v2.pdf")
print(message)
Saving Visual Diff Images
When differences are found, save a highlighted diff image for review:
from pdf2image import convert_from_path
from PIL import Image, ImageChops, ImageDraw
def save_visual_diff(pdf1, pdf2, output_dir="diff_output", dpi=150):
    """Generate diff images highlighting visual differences.

    Each page that differs is written to ``output_dir`` as
    ``page_<n>_diff.png``, with pixel differences amplified for visibility.

    Args:
        pdf1: Path of the first PDF.
        pdf2: Path of the second PDF.
        output_dir: Directory for the diff images; created if missing.
        dpi: Rendering resolution used for both documents.

    Returns:
        A list of 1-based page numbers that differ, in ascending order.
        Pages whose dimensions differ are included, as are pages that
        exist in only one of the documents (no image is saved for those).
    """
    os.makedirs(output_dir, exist_ok=True)
    images1 = convert_from_path(pdf1, dpi=dpi)
    images2 = convert_from_path(pdf2, dpi=dpi)

    differences = []
    for page_no, (img1, img2) in enumerate(zip(images1, images2), start=1):
        rgb1, rgb2 = img1.convert("RGB"), img2.convert("RGB")
        size_differs = rgb1.size != rgb2.size
        if size_differs:
            # Diff only the region both pages share; the size mismatch
            # itself is still reported as a difference below.
            width = min(rgb1.width, rgb2.width)
            height = min(rgb1.height, rgb2.height)
            rgb1 = rgb1.crop((0, 0, width, height))
            rgb2 = rgb2.crop((0, 0, width, height))

        diff = ImageChops.difference(rgb1, rgb2)
        if size_differs or diff.getbbox():
            # Amplify differences for visibility
            diff = diff.point(lambda x: min(255, x * 10))
            diff.save(os.path.join(output_dir, f"page_{page_no}_diff.png"))
            differences.append(page_no)

    # zip() stops at the shorter document, so pages present in only one
    # file would otherwise go unreported.
    shared_pages = min(len(images1), len(images2))
    total_pages = max(len(images1), len(images2))
    differences.extend(range(shared_pages + 1, total_pages + 1))
    return differences
# Usage: print only when at least one page differs
if diff_pages := save_visual_diff("before.pdf", "after.pdf"):
    print(f"Differences on pages: {diff_pages}")
Converting PDFs to images requires significant memory and CPU time, especially for large documents. Use lower DPI (72-100) for faster comparison or limit to specific pages when possible.
Complete Comparison Function
Combine all methods for comprehensive PDF comparison:
import hashlib
import os
import fitz
from pdf2image import convert_from_path
from PIL import ImageChops
def compare_pdfs(pdf1, pdf2, mode="auto"):
    """
    Compare two PDFs using specified mode.

    Modes:
    - 'binary': Exact byte comparison (fastest)
    - 'text': Text content comparison
    - 'visual': Pixel-by-pixel comparison (slowest)
    - 'auto': Try binary first, then text if different

    Returns a dict with three keys:
    - 'identical': True when the files match under the comparison used
    - 'mode_used': the comparison that produced a match, otherwise the
      requested mode
    - 'details': a human-readable explanation of the outcome

    An unrecognised mode does not raise; it yields a non-identical result
    whose 'details' names the invalid mode.
    """
    result = {
        "identical": False,
        "mode_used": mode,
        "details": ""
    }

    if mode not in ("binary", "text", "visual", "auto"):
        # Without this, a misspelled mode would look like a genuine
        # "files differ" result with no explanation.
        result["details"] = f"Unknown mode: {mode!r}"
        return result

    # Binary comparison
    if mode in ("binary", "auto"):
        # Reuse the shared helper rather than repeating its size and hash checks.
        if is_exact_match(pdf1, pdf2):
            result["identical"] = True
            result["mode_used"] = "binary"
            result["details"] = "Files are byte-for-byte identical"
            return result
        if mode == "binary":
            result["details"] = "Binary content differs"
            return result

    # Text comparison ('auto' reaches this point only when the bytes differ)
    if mode in ("text", "auto"):
        if is_content_match(pdf1, pdf2):
            result["identical"] = True
            result["mode_used"] = "text"
            result["details"] = "Text content matches (metadata may differ)"
        else:
            result["details"] = "Text content differs"
        return result

    # Visual comparison is the only mode left after the checks above
    match, message = compare_visual(pdf1, pdf2)
    result["identical"] = match
    result["details"] = message
    return result
# Usage
result = compare_pdfs("doc1.pdf", "doc2.pdf", mode="auto")
# One print call with a newline separator emits the same three lines
print(
    f"Identical: {result['identical']}",
    f"Method: {result['mode_used']}",
    f"Details: {result['details']}",
    sep="\n",
)
Summary
| Method | Detects | Speed | Use Case |
|---|---|---|---|
| SHA-256 hash | Any byte difference | Fast | Backup verification, integrity checks |
| Text extraction | Text content changes | Medium | Document review, content validation |
| Visual comparison | Layout and formatting | Slow | Design QA, regression testing |
Start with hash comparison for speed. If files differ and you need to understand why, use text extraction to check content changes. Reserve visual comparison for layout-critical workflows where formatting matters as much as content.