How to Delete Duplicate Files in Python
Finding and removing duplicate files requires comparing file contents efficiently. Hashing generates a unique fingerprint for each file, allowing quick duplicate detection without comparing entire file contents byte-by-byte.
Memory-Safe File Hashing
Read files in chunks to handle large files without exhausting memory:
import hashlib
from pathlib import Path
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
Chunk sizes in the 8 KB–64 KB range perform well on modern storage; tune upward for very large files. The walrus operator (:=) cleanly handles the read-until-empty pattern.
Basic Duplicate Finder
Scan a directory tree and identify duplicates:
import os
import hashlib
from pathlib import Path
from collections import defaultdict
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
def find_duplicates(root_folder: str) -> dict[str, list[Path]]:
    """Recursively scan *root_folder* and group identical files by content hash.

    Returns a mapping of content hash -> paths, containing only hashes shared
    by two or more files. Files that cannot be hashed are skipped.
    """
    groups: dict[str, list[Path]] = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_folder):
        for name in filenames:
            candidate = Path(dirpath) / name
            digest = get_file_hash(candidate)
            if digest:
                groups[digest].append(candidate)
    # Keep only hashes that occur more than once — those are the duplicates.
    return {digest: paths for digest, paths in groups.items() if len(paths) > 1}
# Report every set of content-identical files found under the target path.
duplicates = find_duplicates("./your-path")
for digest, matching in duplicates.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching:
        print(f" {match}")
Optimized Duplicate Detection
Check file size first to avoid hashing files that can't possibly match:
import os
import hashlib
from pathlib import Path
from collections import defaultdict
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files under *root_folder*, hashing only size-matched candidates.

    Files are first bucketed by byte size; only buckets holding two or more
    entries are hashed, since files of different sizes cannot be identical.
    Returns a mapping of content hash -> paths for every duplicated hash.
    """
    # Pass 1: bucket every reachable file by its size on disk (cheap stat).
    by_size: dict[int, list[Path]] = defaultdict(list)
    for dirpath, _dirs, filenames in os.walk(root_folder):
        for name in filenames:
            candidate = Path(dirpath) / name
            try:
                by_size[candidate.stat().st_size].append(candidate)
            except OSError:
                continue  # unreadable or vanished file — skip it
    # Pass 2: hash only files whose size is shared by at least one other file.
    by_hash: dict[str, list[Path]] = defaultdict(list)
    for same_size in by_size.values():
        if len(same_size) < 2:
            continue  # a unique size can never be a duplicate
        for candidate in same_size:
            digest = get_file_hash(candidate)
            if digest:
                by_hash[digest].append(candidate)
    return {digest: paths for digest, paths in by_hash.items() if len(paths) > 1}
# Scan with the size-prefiltered finder and print each duplicate set.
duplicates = find_duplicates_optimized("./your-path")
for digest, matching in duplicates.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching:
        print(f" {match}")
Safe Deletion with Preview
Preview duplicates before deletion and keep the oldest file:
import os
import hashlib
from pathlib import Path
from datetime import datetime
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ... # omitted, see example above
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ... # omitted, see example above
def delete_duplicates_safe(
    root_folder: str,
    dry_run: bool = True,
    keep_strategy: str = "oldest"
) -> dict:
    """Delete duplicate files under *root_folder*, keeping one copy per set.

    Args:
        root_folder: Directory tree to scan for duplicates.
        dry_run: When True (the default) nothing is deleted; planned
            deletions are only printed.
        keep_strategy: "oldest" keeps the earliest-modified file, "newest"
            keeps the latest; any other value falls back to "oldest".

    Returns:
        Stats dict with keys "found", "deleted", and "freed_bytes".
    """
    duplicates = find_duplicates_optimized(root_folder)
    stats = {"found": 0, "deleted": 0, "freed_bytes": 0}
    for file_hash, paths in duplicates.items():
        stats["found"] += len(paths) - 1
        # A file can vanish between scanning and this point; the original
        # code called p.stat() unguarded here, so one missing file crashed
        # the whole run. Collect mtimes defensively instead.
        mtimes: dict = {}
        for p in paths:
            try:
                mtimes[p] = p.stat().st_mtime
            except OSError:
                continue  # vanished/unreadable — leave it out of this set
        if len(mtimes) < 2:
            continue  # nothing left to deduplicate in this set
        sorted_paths = sorted(mtimes, key=mtimes.get)
        if keep_strategy == "newest":
            keep = sorted_paths[-1]
            remove = sorted_paths[:-1]
        else:
            # "oldest" and any unrecognized strategy keep the oldest copy.
            keep = sorted_paths[0]
            remove = sorted_paths[1:]
        print(f"\nKeeping: {keep}")
        for dup_path in remove:
            try:
                size = dup_path.stat().st_size
            except OSError:
                continue  # vanished since the scan — nothing to delete
            if dry_run:
                print(f" Would delete: {dup_path} ({size:,} bytes)")
            else:
                try:
                    dup_path.unlink()
                    stats["deleted"] += 1
                    stats["freed_bytes"] += size
                    print(f" Deleted: {dup_path}")
                except OSError as e:
                    print(f" Failed to delete {dup_path}: {e}")
    return stats
# Dry-run pass: report what would be removed without touching anything.
preview = delete_duplicates_safe("./documents", dry_run=True)
print(f"\nFound {preview['found']} duplicates")
# Re-run with dry_run=False once the preview looks right:
# stats = delete_duplicates_safe("./documents", dry_run=False)
Always run with dry_run=True first to preview what will be deleted. There is no undo for file deletion.
Moving Duplicates Instead of Deleting
Safer approach that moves duplicates to a staging folder:
import os
import hashlib
import shutil
from pathlib import Path
from datetime import datetime
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ... # omitted, see example above
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ... # omitted, see example above
def move_duplicates(root_folder: str, staging_folder: str = "./duplicates") -> int:
    """Move duplicate files under *root_folder* into *staging_folder* for review.

    The first file of each duplicate set stays in place; every other copy is
    moved into the staging folder under a timestamped name.

    Args:
        root_folder: Directory tree to scan for duplicates.
        staging_folder: Destination directory; created if it does not exist.

    Returns:
        The number of files moved.
    """
    staging = Path(staging_folder)
    # parents=True lets a nested staging path (e.g. "./out/dups") be created;
    # the original mkdir(exist_ok=True) failed when the parent was missing.
    staging.mkdir(parents=True, exist_ok=True)
    duplicates = find_duplicates_optimized(root_folder)
    moved_count = 0
    for file_hash, paths in duplicates.items():
        # Keep the first one, move the rest
        for dup_path in paths[1:]:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            dest_path = staging / f"{timestamp}_{dup_path.name}"
            # The timestamp has only second resolution, so several duplicates
            # of the same filename moved within one second would collide and
            # silently overwrite each other; add a counter suffix until the
            # destination name is free.
            counter = 1
            while dest_path.exists():
                dest_path = staging / f"{timestamp}_{counter}_{dup_path.name}"
                counter += 1
            try:
                shutil.move(str(dup_path), str(dest_path))
                moved_count += 1
                print(f"Moved: {dup_path.name} -> {dest_path}")
            except OSError as e:
                print(f"Failed to move {dup_path}: {e}")
    return moved_count
# Relocate duplicate photos into a review folder rather than deleting them.
moved = move_duplicates("./photos", "./photo_duplicates")
print(f"\nMoved {moved} duplicate files")
Filtering by File Type
Limit duplicate detection to specific file extensions:
def find_duplicates_filtered(
    root_folder: str,
    extensions: set[str] | None = None,
    min_size: int = 0,
    max_size: int | None = None
) -> dict[str, list[Path]]:
    """Find duplicate files, optionally restricted by extension and size.

    Args:
        root_folder: Directory tree to scan.
        extensions: Lowercase suffixes including the dot (e.g. {".jpg"});
            None or an empty set disables extension filtering.
        min_size: Skip files smaller than this many bytes.
        max_size: Skip files larger than this many bytes; None disables the cap.

    Returns:
        A mapping of content hash -> paths for every duplicated hash.
    """
    size_to_files = defaultdict(list)
    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename
            # Extension filter (case-insensitive via the lowercased suffix).
            if extensions and file_path.suffix.lower() not in extensions:
                continue
            try:
                size = file_path.stat().st_size
            except OSError:
                continue  # unreadable or vanished — skip
            if size < min_size:
                continue
            # BUG FIX: the original `if max_size and size > max_size` ignored
            # a cap of 0 because 0 is falsy; test explicitly against None.
            if max_size is not None and size > max_size:
                continue
            size_to_files[size].append(file_path)
    # Hash only size-matched candidates — unique sizes cannot be duplicates.
    hash_to_files = defaultdict(list)
    for paths in size_to_files.values():
        if len(paths) < 2:
            continue
        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)
    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}
# Restrict the scan to common image formats at least 100 KiB in size.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif"}
duplicates = find_duplicates_filtered(
    "./photos",
    extensions=IMAGE_EXTENSIONS,
    min_size=100 * 1024,
)
Progress Reporting for Large Scans
Add progress feedback for large directory scans:
from pathlib import Path
import os
def find_duplicates_with_progress(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files under *root_folder*, printing progress as it works.

    Three phases: count all files, bucket them by size (progress line every
    1000 files), then hash only size-matched candidates (progress line every
    100 hashes).

    Returns:
        A mapping of content hash -> paths for every duplicated hash.
    """
    # First pass: count files so progress can be shown as a fraction.
    print("Counting files...")
    total_files = sum(1 for _, _, files in os.walk(root_folder) for _ in files)
    print(f"Found {total_files:,} files")
    # Second pass: bucket by size (cheap stat call, no hashing yet).
    size_to_files = defaultdict(list)
    processed = 0
    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename
            try:
                size = file_path.stat().st_size
                size_to_files[size].append(file_path)
            except OSError:
                pass  # unreadable file still counts toward progress
            processed += 1
            if processed % 1000 == 0:
                print(f"Scanned: {processed:,}/{total_files:,} files", end="\r")
    # Redundant f-prefix removed: this message has no placeholders.
    print("\nHashing potential duplicates...")
    hash_to_files = defaultdict(list)
    candidates = sum(len(p) for p in size_to_files.values() if len(p) > 1)
    hashed = 0
    for paths in size_to_files.values():
        if len(paths) < 2:
            continue  # a unique size can never be a duplicate
        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)
            hashed += 1
            if hashed % 100 == 0:
                print(f"Hashed: {hashed:,}/{candidates:,} files", end="\r")
    print("\nComplete!")
    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}
Algorithm Comparison
| Step | Method | Purpose |
|---|---|---|
| Traversal | os.walk() | Recursive directory scanning |
| Pre-filter | File size comparison | Skip files that can't match |
| Fingerprint | hashlib.md5() | Unique content identifier |
| Chunked read | f.read(8192) | Prevent memory overflow |
MD5 is sufficient for duplicate detection despite cryptographic weaknesses. For security-critical applications, use SHA-256 instead: hashlib.sha256().
The size-based pre-filtering optimization can dramatically reduce hashing time in directories with many unique files, since files of different sizes cannot be duplicates.