How to Delete Duplicate Files in Python
Finding and removing duplicate files requires comparing file contents efficiently. Hashing generates a unique fingerprint for each file, allowing quick duplicate detection without comparing entire file contents byte-by-byte.
Memory-Safe File Hashing
Read files in chunks to handle large files without exhausting memory:
import hashlib
from pathlib import Path
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
Chunk sizes in the 8 KB–64 KB range perform well on modern storage; tune upward for very large files. The walrus operator (:=) cleanly handles the read-until-empty pattern.
Basic Duplicate Finder
Scan a directory tree and identify duplicates:
import os
import hashlib
from pathlib import Path
from collections import defaultdict
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
def find_duplicates(root_folder: str) -> dict[str, list[Path]]:
    """Recursively scan *root_folder* and group identical files by content hash.

    Returns a mapping of content hash -> paths, containing only hashes shared
    by two or more files. Files that cannot be hashed are skipped.
    """
    groups: dict[str, list[Path]] = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_folder):
        for name in filenames:
            candidate = Path(dirpath) / name
            digest = get_file_hash(candidate)
            if digest:
                groups[digest].append(candidate)
    # Keep only hashes that occur more than once — those are the duplicates.
    return {digest: paths for digest, paths in groups.items() if len(paths) > 1}
# Report every set of content-identical files found under the target path.
duplicates = find_duplicates("./your-path")
for digest, matching in duplicates.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching:
        print(f" {match}")
Optimized Duplicate Detection
Check file size first to avoid hashing files that can't possibly match:
import os
import hashlib
from pathlib import Path
from collections import defaultdict
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
"""Calculate hash of a file using chunked reading."""
hasher = hashlib.new(algorithm)
try:
with open(filepath, "rb") as f:
while chunk := f.read(8192):
hasher.update(chunk)
return hasher.hexdigest()
except (OSError, PermissionError):
return None
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files under *root_folder*, hashing only size-matched candidates.

    Files are first bucketed by byte size; only buckets holding two or more
    entries are hashed, since files of different sizes cannot be identical.
    Returns a mapping of content hash -> paths for every duplicated hash.
    """
    # Pass 1: bucket every reachable file by its size on disk (cheap stat).
    by_size: dict[int, list[Path]] = defaultdict(list)
    for dirpath, _dirs, filenames in os.walk(root_folder):
        for name in filenames:
            candidate = Path(dirpath) / name
            try:
                by_size[candidate.stat().st_size].append(candidate)
            except OSError:
                continue  # unreadable or vanished file — skip it
    # Pass 2: hash only files whose size is shared by at least one other file.
    by_hash: dict[str, list[Path]] = defaultdict(list)
    for same_size in by_size.values():
        if len(same_size) < 2:
            continue  # a unique size can never be a duplicate
        for candidate in same_size:
            digest = get_file_hash(candidate)
            if digest:
                by_hash[digest].append(candidate)
    return {digest: paths for digest, paths in by_hash.items() if len(paths) > 1}
# Scan with the size-prefiltered finder and print each duplicate set.
duplicates = find_duplicates_optimized("./your-path")
for digest, matching in duplicates.items():
    print(f"\nDuplicate set ({digest[:8]}...):")
    for match in matching:
        print(f" {match}")
Safe Deletion with Preview
Preview duplicates before deletion and keep the oldest file:
import os
import hashlib
from pathlib import Path
from datetime import datetime
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ... # omitted, see example above
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ... # omitted, see example above
def delete_duplicates_safe(
    root_folder: str,
    dry_run: bool = True,
    keep_strategy: str = "oldest"
) -> dict:
    """Delete duplicate files under *root_folder*, keeping one copy per set.

    Args:
        root_folder: Directory tree to scan for duplicates.
        dry_run: When True (the default) nothing is deleted; planned
            deletions are only printed.
        keep_strategy: "oldest" keeps the earliest-modified file, "newest"
            keeps the latest; any other value falls back to "oldest".

    Returns:
        Stats dict with keys "found", "deleted", and "freed_bytes".
    """
    duplicates = find_duplicates_optimized(root_folder)
    stats = {"found": 0, "deleted": 0, "freed_bytes": 0}
    for file_hash, paths in duplicates.items():
        stats["found"] += len(paths) - 1
        # A file can vanish between scanning and this point; the original
        # code called p.stat() unguarded here, so one missing file crashed
        # the whole run. Collect mtimes defensively instead.
        mtimes: dict = {}
        for p in paths:
            try:
                mtimes[p] = p.stat().st_mtime
            except OSError:
                continue  # vanished/unreadable — leave it out of this set
        if len(mtimes) < 2:
            continue  # nothing left to deduplicate in this set
        sorted_paths = sorted(mtimes, key=mtimes.get)
        if keep_strategy == "newest":
            keep = sorted_paths[-1]
            remove = sorted_paths[:-1]
        else:
            # "oldest" and any unrecognized strategy keep the oldest copy.
            keep = sorted_paths[0]
            remove = sorted_paths[1:]
        print(f"\nKeeping: {keep}")
        for dup_path in remove:
            try:
                size = dup_path.stat().st_size
            except OSError:
                continue  # vanished since the scan — nothing to delete
            if dry_run:
                print(f" Would delete: {dup_path} ({size:,} bytes)")
            else:
                try:
                    dup_path.unlink()
                    stats["deleted"] += 1
                    stats["freed_bytes"] += size
                    print(f" Deleted: {dup_path}")
                except OSError as e:
                    print(f" Failed to delete {dup_path}: {e}")
    return stats
# Dry-run pass: report what would be removed without touching anything.
preview = delete_duplicates_safe("./documents", dry_run=True)
print(f"\nFound {preview['found']} duplicates")
# Re-run with dry_run=False once the preview looks right:
# stats = delete_duplicates_safe("./documents", dry_run=False)
Always run with dry_run=True first to preview what will be deleted. There is no undo for file deletion.
Moving Duplicates Instead of Deleting
Safer approach that moves duplicates to a staging folder:
import os
import hashlib
import shutil
from pathlib import Path
from datetime import datetime
def get_file_hash(filepath: Path, algorithm: str = "md5") -> str | None:
    ... # omitted, see example above
def find_duplicates_optimized(root_folder: str) -> dict[str, list[Path]]:
    ... # omitted, see example above
def move_duplicates(root_folder: str, staging_folder: str = "./duplicates") -> int:
    """Move duplicate files under *root_folder* into *staging_folder* for review.

    The first file of each duplicate set stays in place; every other copy is
    moved into the staging folder under a timestamped name.

    Args:
        root_folder: Directory tree to scan for duplicates.
        staging_folder: Destination directory; created if it does not exist.

    Returns:
        The number of files moved.
    """
    staging = Path(staging_folder)
    # parents=True lets a nested staging path (e.g. "./out/dups") be created;
    # the original mkdir(exist_ok=True) failed when the parent was missing.
    staging.mkdir(parents=True, exist_ok=True)
    duplicates = find_duplicates_optimized(root_folder)
    moved_count = 0
    for file_hash, paths in duplicates.items():
        # Keep the first one, move the rest
        for dup_path in paths[1:]:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            dest_path = staging / f"{timestamp}_{dup_path.name}"
            # The timestamp has only second resolution, so several duplicates
            # of the same filename moved within one second would collide and
            # silently overwrite each other; add a counter suffix until the
            # destination name is free.
            counter = 1
            while dest_path.exists():
                dest_path = staging / f"{timestamp}_{counter}_{dup_path.name}"
                counter += 1
            try:
                shutil.move(str(dup_path), str(dest_path))
                moved_count += 1
                print(f"Moved: {dup_path.name} -> {dest_path}")
            except OSError as e:
                print(f"Failed to move {dup_path}: {e}")
    return moved_count
# Relocate duplicate photos into a review folder rather than deleting them.
moved = move_duplicates("./photos", "./photo_duplicates")
print(f"\nMoved {moved} duplicate files")
Filtering by File Type
Limit duplicate detection to specific file extensions:
def find_duplicates_filtered(
    root_folder: str,
    extensions: set[str] | None = None,
    min_size: int = 0,
    max_size: int | None = None
) -> dict[str, list[Path]]:
    """Find duplicate files, optionally restricted by extension and size.

    Args:
        root_folder: Directory tree to scan.
        extensions: Lowercase suffixes including the dot (e.g. {".jpg"});
            None or an empty set disables extension filtering.
        min_size: Skip files smaller than this many bytes.
        max_size: Skip files larger than this many bytes; None disables the cap.

    Returns:
        A mapping of content hash -> paths for every duplicated hash.
    """
    size_to_files = defaultdict(list)
    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename
            # Extension filter (case-insensitive via the lowercased suffix).
            if extensions and file_path.suffix.lower() not in extensions:
                continue
            try:
                size = file_path.stat().st_size
            except OSError:
                continue  # unreadable or vanished — skip
            if size < min_size:
                continue
            # BUG FIX: the original `if max_size and size > max_size` ignored
            # a cap of 0 because 0 is falsy; test explicitly against None.
            if max_size is not None and size > max_size:
                continue
            size_to_files[size].append(file_path)
    # Hash only size-matched candidates — unique sizes cannot be duplicates.
    hash_to_files = defaultdict(list)
    for paths in size_to_files.values():
        if len(paths) < 2:
            continue
        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)
    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}
# Restrict the scan to common image formats at least 100 KiB in size.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif"}
duplicates = find_duplicates_filtered(
    "./photos",
    extensions=IMAGE_EXTENSIONS,
    min_size=100 * 1024,
)
Progress Reporting for Large Scans
Add progress feedback for large directory scans:
from pathlib import Path
import os
def find_duplicates_with_progress(root_folder: str) -> dict[str, list[Path]]:
    """Find duplicate files under *root_folder*, printing progress as it works.

    Three phases: count all files, bucket them by size (progress line every
    1000 files), then hash only size-matched candidates (progress line every
    100 hashes).

    Returns:
        A mapping of content hash -> paths for every duplicated hash.
    """
    # First pass: count files so progress can be shown as a fraction.
    print("Counting files...")
    total_files = sum(1 for _, _, files in os.walk(root_folder) for _ in files)
    print(f"Found {total_files:,} files")
    # Second pass: bucket by size (cheap stat call, no hashing yet).
    size_to_files = defaultdict(list)
    processed = 0
    for folder, _, files in os.walk(root_folder):
        for filename in files:
            file_path = Path(folder) / filename
            try:
                size = file_path.stat().st_size
                size_to_files[size].append(file_path)
            except OSError:
                pass  # unreadable file still counts toward progress
            processed += 1
            if processed % 1000 == 0:
                print(f"Scanned: {processed:,}/{total_files:,} files", end="\r")
    # Redundant f-prefix removed: this message has no placeholders.
    print("\nHashing potential duplicates...")
    hash_to_files = defaultdict(list)
    candidates = sum(len(p) for p in size_to_files.values() if len(p) > 1)
    hashed = 0
    for paths in size_to_files.values():
        if len(paths) < 2:
            continue  # a unique size can never be a duplicate
        for file_path in paths:
            file_hash = get_file_hash(file_path)
            if file_hash:
                hash_to_files[file_hash].append(file_path)
            hashed += 1
            if hashed % 100 == 0:
                print(f"Hashed: {hashed:,}/{candidates:,} files", end="\r")
    print("\nComplete!")
    return {h: paths for h, paths in hash_to_files.items() if len(paths) > 1}
Algorithm Comparison
| Step | Method | Purpose |
|---|---|---|
| Traversal | os.walk() | Recursive directory scanning |
| Pre-filter | File size comparison | Skip files that can't match |
| Fingerprint | hashlib.md5() | Unique content identifier |
| Chunked read | f.read(8192) | Prevent memory overflow |
MD5 is sufficient for duplicate detection despite cryptographic weaknesses. For security-critical applications, use SHA-256 instead: hashlib.sha256().
The size-based pre-filtering optimization can dramatically reduce hashing time in directories with many unique files, since files of different sizes cannot be duplicates.