How to Implement Spell Checking with NLTK in Python
Spell checking relies on comparing misspelled words against a dictionary and finding the closest matches using string similarity algorithms. This guide covers implementing spell checking from scratch with NLTK and using production-ready alternatives.
Understanding Edit Distance
Edit distance (also called Levenshtein distance) measures the minimum number of single-character operations needed to transform one word into another. These operations include insertions, deletions, and substitutions.
from nltk.metrics.distance import edit_distance

# Demonstrate edit distance on a few word pairs.
examples = [
    ("happy", "happpy"),   # 1 (one extra 'p' inserted)
    ("cat", "car"),        # 1 (substitute 't' with 'r')
    ("hello", "hallo"),    # 1 (substitute 'e' with 'a')
    ("python", "pithon"),  # 1 (substitute 'y' with 'i')
    ("book", "back"),      # 2 (two substitutions)
]
for first, second in examples:
    print(edit_distance(first, second))
Basic Spell Checker with NLTK
Build a simple spell checker using NLTK's word corpus and edit distance:
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
# Download required data (run once)
# import nltk
# nltk.download('words')
# Full NLTK English word list (~236k entries), loaded once at module level
# so the lookups below reuse the same in-memory dictionary.
correct_words = words.words()
print(f"Dictionary size: {len(correct_words):,} words")
def suggest_correction(misspelled):
    """Return the dictionary word closest to *misspelled* by edit distance.

    Candidates are pre-filtered to words sharing the same first letter,
    which keeps the scan fast but cannot recover when the first letter
    itself is wrong.

    Raises:
        ValueError: if *misspelled* is empty, or no dictionary word
            shares its first letter.
    """
    misspelled_lower = misspelled.lower()
    if not misspelled_lower:
        raise ValueError("cannot correct an empty string")
    # Filter candidates by first letter for performance.
    candidates = [w for w in correct_words
                  if w[0] == misspelled_lower[0]]
    if not candidates:
        raise ValueError(f"no candidates found for {misspelled!r}")
    # min over (distance, word) tuples keeps the original tie-breaking
    # (alphabetically smallest word among equal distances) without
    # materializing the full scored list.
    return min((edit_distance(misspelled_lower, word), word)
               for word in candidates)[1]
# Exercise the spell checker on some common misspellings.
for typo in ("happpy", "writting", "occured"):
    print(suggest_correction(typo))  # happy, writing, occurred
The NLTK words corpus contains approximately 236,000 English words. Filtering by first letter significantly improves performance but may miss suggestions when the first letter is wrong.
Enhanced Spell Checker
Improve the basic implementation with multiple suggestions and better handling:
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
from collections import defaultdict
class SpellChecker:
    """Dictionary-backed spell checker using Levenshtein edit distance.

    Words are indexed by first letter so suggestion lookups only scan a
    small slice of the corpus; keyboard-adjacent first letters are also
    searched to catch typos in the leading character.
    """

    def __init__(self):
        # Store the dictionary lowercased so membership tests are truly
        # case-insensitive: the raw corpus mixes capitalized proper nouns,
        # and is_correct() lowercases its argument, so an original-case
        # set would never match those entries.
        self.word_set = set(w.lower() for w in words.words())
        self.word_list = list(self.word_set)
        # Index words by first letter for faster lookup.
        self.by_first_letter = defaultdict(list)
        for word in self.word_list:
            self.by_first_letter[word[0]].append(word)

    def is_correct(self, word):
        """Check if a word is spelled correctly (case-insensitive)."""
        return word.lower() in self.word_set

    def suggest(self, misspelled, num_suggestions=5):
        """Return up to *num_suggestions* spelling suggestions, best first.

        Returns an empty list for an empty input, and the input itself
        (as a single-element list) when it is already spelled correctly.
        """
        misspelled_lower = misspelled.lower()
        if not misspelled_lower:
            return []
        # If already correct, return it unchanged.
        if self.is_correct(misspelled):
            return [misspelled]
        first_letter = misspelled_lower[0]
        # Copy the index bucket: extending it in place would permanently
        # pollute the first-letter index with words from other letters.
        candidates = list(self.by_first_letter.get(first_letter, []))
        # Also check keyboard-adjacent first letters for typos.
        for letter in self._get_adjacent_letters(first_letter):
            candidates.extend(self.by_first_letter.get(letter, []))
        # Drop duplicate candidates while preserving order, so the same
        # word is never scored twice or returned twice.
        candidates = list(dict.fromkeys(candidates))
        # Score every candidate and keep the closest ones.
        scored = [(edit_distance(misspelled_lower, word), word)
                  for word in candidates]
        scored.sort(key=lambda pair: pair[0])
        return [word for _, word in scored[:num_suggestions]]

    def _get_adjacent_letters(self, letter):
        """Get keyboard-adjacent letters for common typo handling."""
        # QWERTY neighbor map; lookup falls back to '' for non-letters.
        keyboard = {
            'q': 'wa', 'w': 'qeas', 'e': 'wsdr', 'r': 'edft',
            't': 'rfgy', 'y': 'tghu', 'u': 'yhji', 'i': 'ujko',
            'o': 'iklp', 'p': 'ol', 'a': 'qwsz', 's': 'awedxz',
            'd': 'serfcx', 'f': 'drtgvc', 'g': 'ftyhbv', 'h': 'gyujnb',
            'j': 'huikmn', 'k': 'jiolm', 'l': 'kop', 'z': 'asx',
            'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb', 'b': 'vghn',
            'n': 'bhjm', 'm': 'njk'
        }
        return keyboard.get(letter.lower(), '')
# Usage
checker = SpellChecker()
for candidate in ("python", "pythn"):
    print(checker.is_correct(candidate))  # True, then False
for typo in ("happpy", "accomodate", "recieve"):
    print(checker.suggest(typo))  # e.g. ['happy', ...]
Checking Text for Spelling Errors
Process entire sentences or documents:
import re
from nltk.corpus import words
word_set = set(w.lower() for w in words.words())
def find_misspellings(text):
    """Return every word in *text* that is absent from the dictionary.

    Words are extracted as maximal runs of ASCII letters, so hyphenated
    or apostrophe-containing tokens are split into their alphabetic
    parts. Original casing is preserved in the returned list; lookup is
    case-insensitive via the module-level lowercased ``word_set``.
    """
    text_words = re.findall(r'\b[a-zA-Z]+\b', text)
    # Comprehension replaces the manual append loop: same order, same result.
    return [word for word in text_words if word.lower() not in word_set]
sample = """
The quikc brown fox jumps over the lazzy dog.
This sentense contains severl misspeled words.
"""
print(f"Misspelled words: {find_misspellings(sample)}")
# ['quikc', 'lazzy', 'sentense', 'severl', 'misspeled']
Using pyspellchecker for Production
For real applications, use the optimized pyspellchecker library:
pip install pyspellchecker
from spellchecker import SpellChecker

spell = SpellChecker()

# Find misspelled words
text = "The quikc brown fox jumps ovr the lazzy dog"
misspelled = spell.unknown(text.split())
print(f"Misspelled: {misspelled}")
# {'quikc', 'ovr', 'lazzy'}

# For each error, show the single best correction and all candidates.
for word in misspelled:
    print(f"{word} -> {spell.correction(word)}")
    print(f" All candidates: {spell.candidates(word)}")
Output:
quikc -> quick
All candidates: {'quick', 'quicks'}
ovr -> over
All candidates: {'over', 'or', 'our'}
lazzy -> lazy
All candidates: {'lazy', 'jazzy'}
Additional pyspellchecker Features
from spellchecker import SpellChecker

spell = SpellChecker()

# Word frequencies drive suggestion ranking: more frequent = more likely.
print(spell.word_frequency['the'])     # High frequency
print(spell.word_frequency['python'])  # Lower frequency

# Teach the checker domain-specific vocabulary one word at a time...
spell.word_frequency.add('pythonic')
spell.word_frequency.add('numpy')
print(spell.unknown(['pythonic', 'numpy']))  # set() -- now recognized

# ...or in bulk from a word list.
spell.word_frequency.load_words(['tensorflow', 'pytorch', 'sklearn'])
pyspellchecker uses word frequency data to rank suggestions, making it more accurate for common words. It is significantly faster than the NLTK approach and better suited for production use.
Using TextBlob as an Alternative
TextBlob provides another simple interface for spell checking:
pip install textblob
from textblob import TextBlob

# Correct a whole sentence in a single call.
blob = TextBlob("I havv a speling eror")
print(blob.correct())  # I have a spelling error

# Per-word suggestions come with confidence scores.
print(TextBlob("happpy").words[0].spellcheck())
# [('happy', 1.0)] # Word and confidence score
Performance Comparison
import time
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
from spellchecker import SpellChecker
# Test word
test_word = "accommodatoin"

# Load the corpus ONCE, outside the timed region -- reloading it inside
# nltk_correct made the benchmark measure corpus loading, not correction.
correct_words = words.words()

def nltk_correct(misspelled):
    """Naive NLTK correction: scan same-first-letter words, pick the closest."""
    candidates = [w for w in correct_words if w[0] == misspelled[0].lower()]
    distances = [(edit_distance(misspelled.lower(), w), w) for w in candidates]
    return min(distances, key=lambda x: x[0])[1]

# pyspellchecker approach
spell = SpellChecker()

# Benchmark with perf_counter: a monotonic, high-resolution clock that is
# unaffected by system clock adjustments (unlike time.time).
start = time.perf_counter()
for _ in range(10):
    nltk_correct(test_word)
nltk_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(100):
    spell.correction(test_word)
pyspell_time = time.perf_counter() - start

print(f"NLTK (10 iterations): {nltk_time:.3f}s")
print(f"pyspellchecker (100 iterations): {pyspell_time:.3f}s")
Method Comparison
| Method | Speed | Accuracy | Best For |
|---|---|---|---|
| NLTK edit distance | 🐢 Slow | Good | Learning algorithms |
| pyspellchecker | ⚡ Fast | Excellent | Production applications |
| TextBlob |