Skip to main content

How to Check if Two PDF Documents are Identical in Python

Comparing PDF files requires different approaches depending on your definition of "identical." A binary comparison checks if files are exact byte-for-byte copies, while content comparison focuses on the actual text or visual appearance.

This guide covers all three methods: hash-based verification, text extraction, and visual comparison.

Binary Comparison (Exact Match)

For verifying backups or detecting file corruption, compare SHA-256 hashes. This method detects any difference, including metadata changes like timestamps:

import hashlib
import os

def get_file_hash(filepath, chunk_size=65536):
    """Generate SHA-256 hash of file contents.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. Must be positive;
            the default of 64 KiB matches the original hard-coded value.

    Returns:
        The SHA-256 digest of the file as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately and would silently hash nothing.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hasher = hashlib.sha256()
    with open(filepath, "rb") as f:
        # Stream the file so large PDFs are never loaded into memory at once.
        while chunk := f.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()

def is_exact_match(pdf1, pdf2):
    """Check if two PDFs are byte-for-byte identical.

    Args:
        pdf1: Path to the first file.
        pdf2: Path to the second file.

    Returns:
        True when both files have the same size and the same SHA-256
        digest, False otherwise.

    Raises:
        OSError: If either file does not exist or cannot be read.
    """
    # Quick size check first: files of different length can never be equal,
    # so the more expensive hashing step is skipped entirely.
    if os.path.getsize(pdf1) != os.path.getsize(pdf2):
        return False

    return get_file_hash(pdf1) == get_file_hash(pdf2)

# Usage: verify that a backup is a byte-for-byte copy of the original.
if is_exact_match("report_v1.pdf", "report_backup.pdf"):
    print("Files are identical")
else:
    print("Files differ")
When to Use Hash Comparison

Hash comparison is ideal for backup verification, detecting transmission errors, or confirming file integrity. It's fast and catches any modification, but will report files as different even if only metadata (like creation date) changed.

Text Content Comparison

When files have different metadata but you need to verify the actual content is the same, extract and compare text using PyMuPDF:

pip install pymupdf
import fitz  # PyMuPDF

def extract_text(filepath):
    """Extract all text from a PDF document.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        The text of every page, concatenated in page order, as one string.
    """
    # The context manager closes the document even if a page fails to
    # extract, which the original open()/close() pair did not guarantee.
    with fitz.open(filepath) as doc:
        # join() avoids repeatedly re-copying the growing string per page.
        return "".join(page.get_text() for page in doc)

def is_content_match(pdf1, pdf2):
    """Compare PDFs by text content, ignoring metadata.

    Args:
        pdf1: Path to the first PDF.
        pdf2: Path to the second PDF.

    Returns:
        True when the extracted text of both documents is equal after
        whitespace normalization, False otherwise.
    """
    text1 = extract_text(pdf1)
    text2 = extract_text(pdf2)

    # Normalize whitespace for fair comparison: every run of spaces, tabs
    # or newlines collapses to a single space, so differences that are only
    # line breaks or spacing do not count as content changes.
    normalized1 = " ".join(text1.split())
    normalized2 = " ".join(text2.split())

    return normalized1 == normalized2

# Usage: check whether two contract revisions carry the same text.
if is_content_match("contract_v1.pdf", "contract_v2.pdf"):
    print("Documents contain the same text")
else:
    print("Text content differs")

Detailed Text Comparison with Diff

To see exactly what changed between documents:

import fitz
import difflib

def compare_text_detailed(pdf1, pdf2):
    """Generate a detailed diff of text differences.

    Args:
        pdf1: Path to the "old" PDF; also used as the diff's from-file label.
        pdf2: Path to the "new" PDF; also used as the diff's to-file label.

    Returns:
        A list of unified-diff lines (headers, hunks, and +/- lines).
        The list is empty when the extracted text is the same line for line.
    """
    text1 = extract_text(pdf1).splitlines()
    text2 = extract_text(pdf2).splitlines()

    diff = difflib.unified_diff(
        text1,
        text2,
        fromfile=pdf1,
        tofile=pdf2,
        # splitlines() already stripped the newlines, so the diff control
        # lines must not get one appended either.
        lineterm="",
    )

    return list(diff)

# Usage: print the start of the diff between two document versions.
differences = compare_text_detailed("doc_old.pdf", "doc_new.pdf")
if differences:
    print("Differences found:")
    for line in differences[:20]:  # Show first 20 lines
        print(line)
else:
    print("No text differences")
Limitations of Text Comparison

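Text extraction ignores images, formatting, fonts, and layout. Two PDFs with identical text but different visual appearance will be reported as matching. In particular, PDFs that consist only of scanned images yield empty text, so any two such files will compare as equal.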

Visual Comparison (Page Images)

For layout verification, convert pages to images and compare pixels. This catches formatting changes that text comparison misses:

pip install pdf2image pillow  # pdf2image also requires the Poppler utilities to be installed on the system
from pdf2image import convert_from_path
from PIL import ImageChops
import os

def compare_visual(pdf1, pdf2, dpi=150):
    """Compare PDFs visually by converting to images.

    Args:
        pdf1: Path to the first PDF.
        pdf2: Path to the second PDF.
        dpi: Resolution used to rasterize both documents.

    Returns:
        A ``(match, message)`` tuple. ``match`` is True only when both
        documents have the same number of pages and every page pair has the
        same dimensions and identical pixels. ``message`` describes the
        first mismatch found, or confirms that the documents are identical.
    """
    images1 = convert_from_path(pdf1, dpi=dpi)
    images2 = convert_from_path(pdf2, dpi=dpi)

    # Check page count
    if len(images1) != len(images2):
        return False, f"Page count differs: {len(images1)} vs {len(images2)}"

    # Compare each page, stopping at the first one that differs
    for i, (img1, img2) in enumerate(zip(images1, images2)):
        if img1.size != img2.size:
            return False, f"Page {i+1} dimensions differ"

        diff = ImageChops.difference(img1, img2)
        # getbbox() is None only when the difference image is entirely zero.
        if diff.getbbox() is not None:
            return False, f"Page {i+1} has visual differences"

    return True, "Documents are visually identical"

# Usage: run the pixel comparison and report its outcome message.
visual_result = compare_visual("layout_v1.pdf", "layout_v2.pdf")
match, message = visual_result
print(message)

Saving Visual Diff Images

When differences are found, save a highlighted diff image for review:

from pdf2image import convert_from_path
from PIL import Image, ImageChops, ImageDraw

def save_visual_diff(pdf1, pdf2, output_dir="diff_output", dpi=150):
    """Generate diff images highlighting visual differences.

    Args:
        pdf1: Path to the first PDF.
        pdf2: Path to the second PDF.
        output_dir: Directory that receives the diff images; created when
            it does not exist.
        dpi: Resolution used to rasterize both documents.

    Returns:
        A list of 1-based page numbers that differ. A page that exists in
        only one of the documents is reported as differing, but no diff
        image is written for it because there is nothing to compare it to.
    """
    os.makedirs(output_dir, exist_ok=True)

    images1 = convert_from_path(pdf1, dpi=dpi)
    images2 = convert_from_path(pdf2, dpi=dpi)

    differences = []

    for page_number, (img1, img2) in enumerate(zip(images1, images2), start=1):
        img1 = img1.convert("RGB")
        img2 = img2.convert("RGB")

        size_mismatch = img1.size != img2.size
        if size_mismatch:
            # Put both pages on a common white canvas so the whole area of
            # the larger page takes part in the comparison.
            canvas_size = (
                max(img1.width, img2.width),
                max(img1.height, img2.height),
            )
            padded = []
            for img in (img1, img2):
                canvas = Image.new("RGB", canvas_size, "white")
                canvas.paste(img, (0, 0))
                padded.append(canvas)
            img1, img2 = padded

        diff = ImageChops.difference(img1, img2)

        # Different page dimensions always count as a difference, even when
        # the padded area happens to be blank.
        if size_mismatch or diff.getbbox():
            # Amplify differences for visibility
            diff = diff.point(lambda x: min(255, x * 10))
            diff.save(os.path.join(output_dir, f"page_{page_number}_diff.png"))
            differences.append(page_number)

    # zip() stops at the shorter document; report the unmatched trailing
    # pages instead of silently ignoring them.
    shared_pages = min(len(images1), len(images2))
    total_pages = max(len(images1), len(images2))
    differences.extend(range(shared_pages + 1, total_pages + 1))

    return differences

# Usage: write diff images and list the pages that changed.
diff_pages = save_visual_diff("before.pdf", "after.pdf")
if diff_pages:
    print(f"Differences on pages: {diff_pages}")
Visual Comparison is Resource Intensive

Converting PDFs to images requires significant memory and CPU time, especially for large documents. Use lower DPI (72-100) for faster comparison or limit to specific pages when possible.

Complete Comparison Function

Combine all methods for comprehensive PDF comparison:

import hashlib
import os
import fitz
from pdf2image import convert_from_path
from PIL import ImageChops

def compare_pdfs(pdf1, pdf2, mode="auto"):
    """
    Compare two PDFs using specified mode.

    Modes:
    - 'binary': Exact byte comparison (fastest)
    - 'text': Text content comparison
    - 'visual': Pixel-by-pixel comparison (slowest)
    - 'auto': Try binary first, then text if different

    Args:
        pdf1: Path to the first PDF.
        pdf2: Path to the second PDF.
        mode: One of the modes listed above.

    Returns:
        A dict with three keys: "identical" (bool), "mode_used" (the
        comparison that produced the verdict) and "details" (a
        human-readable explanation). An unrecognized mode yields
        "identical": False with the problem described in "details".
    """
    result = {
        "identical": False,
        "mode_used": mode,
        "details": ""
    }

    valid_modes = ("binary", "text", "visual", "auto")
    if mode not in valid_modes:
        # Explain the failure instead of returning an empty "details".
        result["details"] = (
            f"Unknown mode {mode!r}; expected one of: {', '.join(valid_modes)}"
        )
        return result

    # Binary comparison
    if mode in ("binary", "auto"):
        # Only hash when the sizes agree; different sizes can never match.
        if os.path.getsize(pdf1) == os.path.getsize(pdf2):
            if get_file_hash(pdf1) == get_file_hash(pdf2):
                result["identical"] = True
                result["mode_used"] = "binary"
                result["details"] = "Files are byte-for-byte identical"
                return result

        if mode == "binary":
            result["details"] = "Binary content differs"
            return result

    # Text comparison
    if mode in ("text", "auto"):
        # In 'auto' mode the text comparison decides from here on, so record
        # it for both the matching and the differing outcome.
        result["mode_used"] = "text"

        if is_content_match(pdf1, pdf2):
            result["identical"] = True
            result["details"] = "Text content matches (metadata may differ)"
            return result

        result["details"] = "Text content differs"
        return result

    # Visual comparison (the only mode left after the checks above)
    match, message = compare_visual(pdf1, pdf2)
    result["identical"] = match
    result["details"] = message
    return result

# Usage: run the combined comparison and print each field of the verdict.
result = compare_pdfs("doc1.pdf", "doc2.pdf", mode="auto")
for label, key in (
    ("Identical", "identical"),
    ("Method", "mode_used"),
    ("Details", "details"),
):
    print(f"{label}: {result[key]}")

Summary

| Method | Detects | Speed | Use Case |
| --- | --- | --- | --- |
| SHA-256 hash | Any byte difference | Fast | Backup verification, integrity checks |
| Text extraction | Text content changes | Medium | Document review, content validation |
| Visual comparison | Layout and formatting | Slow | Design QA, regression testing |
Recommended Approach

Start with hash comparison for speed. If files differ and you need to understand why, use text extraction to check content changes. Reserve visual comparison for layout-critical workflows where formatting matters as much as content.