Skip to main content

How to Delete Duplicate Files in Python

Finding and removing duplicate files requires comparing file contents efficiently. Hashing generates a unique fingerprint for each file, allowing quick duplicate detection without comparing entire file contents byte-by-byte.

Memory-Safe File Hashing

Read files in chunks to handle large files without exhausting memory:

import hashlib
from pathlib import Path

def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)

try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
> **Tip:** Use larger chunk sizes (8 KB–64 KB) for better performance on modern storage. The walrus operator (`:=`) cleanly handles the read-until-empty pattern.

Basic Duplicate Finder

Scan a directory tree and identify duplicates:

import os
import hashlib
from pathlib import Path
from collections import defaultdict

def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)

try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None

def find_duplicates(root_folder: str) -> dict[str, list[Path]]:
    """Group every readable file under root_folder by content hash.

    Returns:
        Mapping of hash -> paths, keeping only hashes shared by 2+ files.
    """
    grouped: defaultdict[str, list[Path]] = defaultdict(list)

    for dirpath, _dirs, names in os.walk(root_folder):
        base = Path(dirpath)
        for name in names:
            candidate = base / name
            digest = get_file_hash(candidate)
            # Unreadable files hash to None and are skipped.
            if digest:
                grouped[digest].append(candidate)

    # Keep only the hashes that more than one file maps to.
    return {digest: files for digest, files in grouped.items() if len(files) > 1}

# Scan and report every duplicate set found under the target directory.
dupe_sets = find_duplicates("./your-path")

for digest, matching_paths in dupe_sets.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching_paths:
        print(f" {match}")

Optimized Duplicate Detection

Check file size first to avoid hashing files that can't possibly match:

import os
import hashlib
from pathlib import Path
from collections import defaultdict

def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)

try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None

def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files, pre-filtering by size before hashing.

    Files with a unique size cannot have a duplicate, so only files that
    share a size with at least one other file are hashed.
    """
    # Step 1: bucket every file by its size (cheap stat call, no hashing).
    by_size: defaultdict[int, list[Path]] = defaultdict(list)

    for dirpath, _dirs, filenames in os.walk(root_folder):
        for fn in filenames:
            candidate = Path(dirpath) / fn
            try:
                by_size[candidate.stat().st_size].append(candidate)
            except OSError:
                continue  # vanished or unreadable: skip it

    # Step 2: hash only the buckets that hold two or more files.
    by_hash: defaultdict[str, list[Path]] = defaultdict(list)

    for group in by_size.values():
        if len(group) < 2:
            continue  # unique size -> no duplicate possible
        for candidate in group:
            digest = get_file_hash(candidate)
            if digest:
                by_hash[digest].append(candidate)

    return {digest: files for digest, files in by_hash.items() if len(files) > 1}

# Scan with the size-prefiltered finder and report each duplicate set.
dupe_sets = find_duplicates_optimized("./your-path")

for digest, matching_paths in dupe_sets.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching_paths:
        print(f" {match}")

Safe Deletion with Preview

Preview duplicates before deletion and keep the oldest file:

import os
import hashlib
from pathlib import Path
from datetime import datetime

def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ...  # omitted, see the example above

def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ...  # omitted, see the example above

def delete_duplicates_safe(
    root_folder: str,
    dry_run: bool = True,
    keep_strategy: str = "oldest"
) -> dict:
    """Delete duplicate files, keeping one copy per duplicate set.

    Args:
        root_folder: Directory tree to scan for duplicates.
        dry_run: When True (default), only print what would be deleted.
        keep_strategy: "oldest" keeps the earliest-modified copy,
            "newest" keeps the latest; any other value falls back to "oldest".

    Returns:
        Stats dict with "found", "deleted", and "freed_bytes" counters.
    """
    duplicates = find_duplicates_optimized(root_folder)
    stats = {"found": 0, "deleted": 0, "freed_bytes": 0}

    def _mtime(p: Path) -> float:
        # Files that vanished mid-scan sort first instead of raising
        # OSError and aborting the whole run.
        try:
            return p.stat().st_mtime
        except OSError:
            return 0.0

    for file_hash, paths in duplicates.items():
        stats["found"] += len(paths) - 1

        # Sort by modification time, oldest first.
        sorted_paths = sorted(paths, key=_mtime)

        # "newest" keeps the last-modified copy; anything else (including
        # unknown strategies) keeps the oldest, matching the default.
        if keep_strategy == "newest":
            keep, remove = sorted_paths[-1], sorted_paths[:-1]
        else:
            keep, remove = sorted_paths[0], sorted_paths[1:]

        print(f"\nKeeping: {keep}")

        for dup_path in remove:
            # stat() can fail for files deleted out from under us; treat
            # that like a failed delete rather than crashing.
            try:
                size = dup_path.stat().st_size
            except OSError as e:
                print(f" Failed to delete {dup_path}: {e}")
                continue

            if dry_run:
                print(f" Would delete: {dup_path} ({size:,} bytes)")
            else:
                try:
                    dup_path.unlink()
                    stats["deleted"] += 1
                    stats["freed_bytes"] += size
                    print(f" Deleted: {dup_path}")
                except OSError as e:
                    print(f" Failed to delete {dup_path}: {e}")

    return stats

# Always preview with dry_run=True before deleting anything.
preview = delete_duplicates_safe("./documents", dry_run=True)
print(f"\nFound {preview['found']} duplicates")

# Re-run with dry_run=False to actually delete:
# stats = delete_duplicates_safe("./documents", dry_run=False)
> **Warning:** Always run with `dry_run=True` first to preview what will be deleted. There is no undo for file deletion.

Moving Duplicates Instead of Deleting

Safer approach that moves duplicates to a staging folder:

import os
import hashlib
import shutil
from pathlib import Path
from datetime import datetime

def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ...  # omitted, see the example above

def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ...  # omitted, see the example above

def move_duplicates(root_folder: str, staging_folder: str = "./duplicates") -> int:
    """Move duplicate files into a staging folder for manual review.

    The first file in each duplicate set stays in place; the rest are
    moved into staging_folder (created if missing).

    Args:
        root_folder: Directory tree to scan for duplicates.
        staging_folder: Destination directory for the moved copies.

    Returns:
        The number of files moved.
    """
    staging = Path(staging_folder)
    # parents=True so a nested staging path like "./out/dupes" also works.
    staging.mkdir(parents=True, exist_ok=True)

    duplicates = find_duplicates_optimized(root_folder)
    moved_count = 0

    for file_hash, paths in duplicates.items():
        # Keep the first one, move the rest
        for dup_path in paths[1:]:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            dest_path = staging / f"{timestamp}_{dup_path.name}"
            # A timestamp alone collides when two same-named duplicates are
            # moved within one second, and shutil.move would then silently
            # overwrite the earlier file. Add a numeric suffix instead.
            counter = 1
            while dest_path.exists():
                dest_path = staging / f"{timestamp}_{counter}_{dup_path.name}"
                counter += 1

            try:
                shutil.move(str(dup_path), str(dest_path))
                moved_count += 1
                print(f"Moved: {dup_path.name} -> {dest_path}")
            except OSError as e:
                print(f"Failed to move {dup_path}: {e}")

    return moved_count

# Stage photo duplicates for review instead of deleting them outright.
moved_total = move_duplicates("./photos", "./photo_duplicates")
print(f"\nMoved {moved_total} duplicate files")

Filtering by File Type

Limit duplicate detection to specific file extensions:

def find_duplicates_filtered(
    root_folder: str,
    extensions: set[str] | None = None,
    min_size: int = 0,
    max_size: int | None = None
) -> dict[str, list[Path]]:
    """Find duplicate files, optionally restricted by extension and size.

    Args:
        root_folder: Directory tree to scan.
        extensions: Lower-case suffixes to include (e.g. {".jpg", ".png"});
            None or empty means every extension.
        min_size: Ignore files smaller than this many bytes.
        max_size: Ignore files larger than this many bytes; None means no cap.

    Returns:
        Mapping of content hash -> paths, duplicates only.
    """
    size_to_files = defaultdict(list)

    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename

            # Filter by extension (suffixes compared case-insensitively)
            if extensions and file_path.suffix.lower() not in extensions:
                continue

            try:
                size = file_path.stat().st_size
            except OSError:
                continue  # unreadable or vanished file: skip it

            # Filter by size. Compare max_size against None explicitly:
            # the old truthiness test (`if max_size and ...`) silently
            # disabled the cap when max_size was 0.
            if size < min_size:
                continue
            if max_size is not None and size > max_size:
                continue

            size_to_files[size].append(file_path)

    # Hash only files that share a size with at least one other file
    hash_to_files = defaultdict(list)

    for paths in size_to_files.values():
        if len(paths) < 2:
            continue

        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)

    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}

# Find duplicate images larger than 100 KB.
image_suffixes = {".jpg", ".jpeg", ".png", ".gif"}
duplicates = find_duplicates_filtered(
    "./photos",
    extensions=image_suffixes,
    min_size=100 * 1024,
)

Progress Reporting for Large Scans

Add progress feedback for large directory scans:

from pathlib import Path
import os

def find_duplicates_with_progress(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files, printing progress for large scans.

    Pass 1 counts the files so progress can be shown as a fraction;
    pass 2 buckets files by size; only files that share a size are hashed.

    Returns:
        Mapping of content hash -> paths, duplicates only.
    """
    # First pass: count files
    print("Counting files...")
    total_files = sum(1 for _, _, files in os.walk(root_folder) for _ in files)
    print(f"Found {total_files:,} files")

    # Second pass: bucket by size (cheap stat calls, no hashing yet)
    size_to_files = defaultdict(list)
    processed = 0

    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename

            try:
                size = file_path.stat().st_size
                size_to_files[size].append(file_path)
            except OSError:
                pass  # unreadable file still counts toward progress

            processed += 1
            if processed % 1000 == 0:
                # end="\r" rewrites the same terminal line in place
                print(f"Scanned: {processed:,}/{total_files:,} files", end="\r")

    # Plain strings here: the old code used f-strings with no placeholders.
    print("\nHashing potential duplicates...")

    hash_to_files = defaultdict(list)
    # Only files sharing a size with another file need hashing.
    candidates = sum(len(p) for p in size_to_files.values() if len(p) > 1)
    hashed = 0

    for paths in size_to_files.values():
        if len(paths) < 2:
            continue

        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)

            hashed += 1
            if hashed % 100 == 0:
                print(f"Hashed: {hashed:,}/{candidates:,} files", end="\r")

    print("\nComplete!")

    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}

Algorithm Comparison

| Step | Method | Purpose |
| --- | --- | --- |
| Traversal | `os.walk()` | Recursive directory scanning |
| Pre-filter | File size comparison | Skip files that can't match |
| Fingerprint | `hashlib.md5()` | Unique content identifier |
| Chunked read | `f.read(8192)` | Prevent memory overflow |
> **Note:** MD5 is sufficient for duplicate detection despite its cryptographic weaknesses. For security-critical applications, use SHA-256 instead: `hashlib.sha256()`.

The size-based pre-filtering optimization can dramatically reduce hashing time in directories with many unique files, since files of different sizes cannot be duplicates.