Skip to main content

How to Implement Spell Checking with NLTK in Python

Spell checking relies on comparing misspelled words against a dictionary and finding the closest matches using string similarity algorithms. This guide covers implementing spell checking from scratch with NLTK and using production-ready alternatives.

Understanding Edit Distance

Edit distance (also called Levenshtein distance) measures the minimum number of single-character operations needed to transform one word into another. These operations include insertions, deletions, and substitutions.

from nltk.metrics.distance import edit_distance

# Examples of edit distance (each note describes turning the first word into the second)
print(edit_distance("happy", "happpy")) # 1 (insert one 'p')
print(edit_distance("cat", "car")) # 1 (substitute 'r' for 't')
print(edit_distance("hello", "hallo")) # 1 (substitute 'a' for 'e')
print(edit_distance("python", "pithon")) # 1 (substitute 'i' for 'y')
print(edit_distance("book", "back")) # 2 (two substitutions)

Basic Spell Checker with NLTK

Build a simple spell checker using NLTK's word corpus and edit distance:

from nltk.metrics.distance import edit_distance
from nltk.corpus import words

# Download required data (run once)
# import nltk
# nltk.download('words')

# Load the word list a single time at module level; suggest_correction()
# below scans this list on every call.
correct_words = words.words()
print(f"Dictionary size: {len(correct_words):,} words")

def suggest_correction(misspelled):
    """Find the closest dictionary word to a misspelled word.

    Only entries of the module-level ``correct_words`` list that start with
    the same letter as the lowercased input are compared. This keeps the
    search fast but cannot repair a wrong first letter.

    Args:
        misspelled: The word to correct. It is lowercased before comparison.

    Returns:
        The candidate with the smallest edit distance (the earliest one in
        ``correct_words`` when several tie), or ``misspelled`` unchanged when
        it is empty or no dictionary word shares its first letter.
    """
    # An empty string has no first letter to filter on.
    if not misspelled:
        return misspelled

    misspelled_lower = misspelled.lower()
    first_letter = misspelled_lower[0]

    # Filter candidates by first letter for performance. Slicing (w[:1])
    # rather than indexing tolerates empty dictionary entries.
    candidates = [w for w in correct_words if w[:1] == first_letter]
    if not candidates:
        # min() would raise ValueError on an empty sequence.
        return misspelled

    # min() keeps the first of equally distant words, so ties resolve in
    # dictionary order.
    return min(candidates,
               key=lambda word: edit_distance(misspelled_lower, word))

# Test the spell checker
# When several words are equally close, the one that appears first in the
# dictionary is returned, so the exact output depends on the corpus contents.
print(suggest_correction("happpy")) # happy
print(suggest_correction("writting")) # writing
print(suggest_correction("occured")) # occurred
Note:

The NLTK words corpus contains approximately 236,000 English words. Filtering by first letter significantly improves performance but may miss suggestions when the first letter is wrong.

Enhanced Spell Checker

Improve the basic implementation with multiple suggestions and better handling:

from nltk.metrics.distance import edit_distance
from nltk.corpus import words
from collections import defaultdict

class SpellChecker:
    """Dictionary-based spell checker that ranks suggestions by edit distance.

    All dictionary words are stored lowercased, so lookups and suggestions
    are case-insensitive. Words are indexed by first letter so that only a
    slice of the dictionary is scored for each query.
    """

    # Letters physically adjacent to each key on a QWERTY keyboard, used to
    # widen the candidate pool when the first letter itself was mistyped.
    _KEYBOARD_NEIGHBORS = {
        'q': 'wa', 'w': 'qeas', 'e': 'wsdr', 'r': 'edft',
        't': 'rfgy', 'y': 'tghu', 'u': 'yhji', 'i': 'ujko',
        'o': 'iklp', 'p': 'ol', 'a': 'qwsz', 's': 'awedxz',
        'd': 'serfcx', 'f': 'drtgvc', 'g': 'ftyhbv', 'h': 'gyujnb',
        'j': 'huikmn', 'k': 'jiolm', 'l': 'kop', 'z': 'asx',
        'x': 'zsdc', 'c': 'xdfv', 'v': 'cfgb', 'b': 'vghn',
        'n': 'bhjm', 'm': 'njk',
    }

    def __init__(self, dictionary=None, distance=None):
        """Build the lookup structures.

        Args:
            dictionary: Optional iterable of known words. Defaults to the
                NLTK ``words`` corpus.
            distance: Optional callable ``(a, b) -> number`` used to score
                candidates. Defaults to NLTK's ``edit_distance``.
        """
        if dictionary is None:
            dictionary = words.words()
        self._distance = edit_distance if distance is None else distance

        # Lowercase once so is_correct() (which lowercases its argument)
        # also matches entries the corpus lists capitalised.
        self.word_set = {word.lower() for word in dictionary if word}
        # Sorted so that the order of equally scored suggestions does not
        # depend on set iteration order.
        self.word_list = sorted(self.word_set)

        # Index words by first letter for faster lookup
        self.by_first_letter = defaultdict(list)
        for word in self.word_list:
            self.by_first_letter[word[0]].append(word)

    def is_correct(self, word):
        """Check if a word is spelled correctly (case-insensitive)."""
        return word.lower() in self.word_set

    def suggest(self, misspelled, num_suggestions=5):
        """Return the top N spelling suggestions, closest first.

        Args:
            misspelled: The word to look up.
            num_suggestions: Maximum number of suggestions to return.

        Returns:
            ``[misspelled]`` if the word is already in the dictionary, an
            empty list for empty input, otherwise up to ``num_suggestions``
            lowercase words ordered by increasing distance.
        """
        # An empty string has no first letter to index on.
        if not misspelled:
            return []

        # If already correct, return it
        if self.is_correct(misspelled):
            return [misspelled]

        misspelled_lower = misspelled.lower()
        first_letter = misspelled_lower[0]

        # Copy the indexed list: extending it in place would permanently
        # add the neighbouring letters' words to the index on every call.
        candidates = list(self.by_first_letter.get(first_letter, ()))

        # Also check adjacent letters on keyboard for typos
        for letter in self._get_adjacent_letters(first_letter):
            candidates.extend(self.by_first_letter.get(letter, ()))

        scored = [(self._distance(misspelled_lower, word), word)
                  for word in candidates]

        # Stable sort on distance only: among ties, words sharing the typed
        # first letter stay ahead of those from neighbouring keys.
        scored.sort(key=lambda pair: pair[0])
        limit = max(num_suggestions, 0)
        return [word for _, word in scored[:limit]]

    def _get_adjacent_letters(self, letter):
        """Get keyboard-adjacent letters ('' for keys not in the table)."""
        return self._KEYBOARD_NEIGHBORS.get(letter.lower(), '')

# Usage
# Constructing the checker reads the whole word list and builds its
# first-letter index, so create one instance and reuse it.
checker = SpellChecker()

print(checker.is_correct("python")) # True
print(checker.is_correct("pythn")) # False

print(checker.suggest("happpy")) # ['happy', 'nappy', ...]
print(checker.suggest("accomodate")) # ['accommodate', ...]
print(checker.suggest("recieve")) # ['receive', ...]

Checking Text for Spelling Errors

Process entire sentences or documents:

import re
from nltk.corpus import words

# Lowercase every entry once so the membership test in find_misspellings()
# is case-insensitive and O(1) per word.
word_set = set(w.lower() for w in words.words())

def find_misspellings(text, dictionary=None):
    """Find all misspelled words in text.

    Args:
        text: The text to scan.
        dictionary: Optional container of known lowercase words. Defaults to
            the module-level ``word_set``.

    Returns:
        The unknown words in order of appearance, with their original casing
        and with repeats preserved.
    """
    known_words = word_set if dictionary is None else dictionary

    # Extract runs of ASCII letters; digits and punctuation act as
    # separators, and tokens glued to digits are skipped by the \b anchors.
    return [
        word
        for word in re.findall(r'\b[a-zA-Z]+\b', text)
        if word.lower() not in known_words
    ]

text = """
The quikc brown fox jumps over the lazzy dog.
This sentense contains severl misspeled words.
"""

errors = find_misspellings(text)
print(f"Misspelled words: {errors}")
# ['quikc', 'lazzy', 'sentense', 'severl', 'misspeled']
# NOTE(review): the check is an exact lookup, so any correctly spelled word
# missing from the corpus (possibly inflected forms such as plurals) would
# also be reported — confirm the sample output against the installed corpus.

Using pyspellchecker for Production

For real applications, use the optimized pyspellchecker library:

pip install pyspellchecker
from spellchecker import SpellChecker

spell = SpellChecker()

# Find misspelled words
text = "The quikc brown fox jumps ovr the lazzy dog"
# str.split() only splits on whitespace; strip punctuation first when
# checking real sentences, or tokens like "dog." will be reported.
words_list = text.split()

misspelled = spell.unknown(words_list)
print(f"Misspelled: {misspelled}")
# {'quikc', 'ovr', 'lazzy'}

# Get corrections
for word in misspelled:
    # Recent pyspellchecker releases return None when nothing is close
    # enough; fall back to the word itself / an empty set in that case.
    correction = spell.correction(word) or word
    candidates = spell.candidates(word) or set()
    print(f"{word} -> {correction}")
    print(f" All candidates: {candidates}")

Output:

quikc -> quick
All candidates: {'quick', 'quicks'}
ovr -> over
All candidates: {'over', 'or', 'our'}
lazzy -> lazy
All candidates: {'lazy', 'jazzy'}

Additional pyspellchecker Features

from spellchecker import SpellChecker

spell = SpellChecker()

# Check word frequency (likelihood)
print(spell.word_frequency['the']) # High frequency
print(spell.word_frequency['python']) # Lower frequency

# Add custom words to dictionary (one word per call)
spell.word_frequency.add('pythonic')
spell.word_frequency.add('numpy')

# Now these are recognized as correct
print(spell.unknown(['pythonic', 'numpy'])) # set()

# Load custom word list (several words in one call)
spell.word_frequency.load_words(['tensorflow', 'pytorch', 'sklearn'])
Tip:

pyspellchecker uses word frequency data to rank suggestions, making it more accurate for common words. It is significantly faster than the NLTK approach and better suited for production use.

Using TextBlob as an Alternative

TextBlob provides another simple interface for spell checking:

pip install textblob
from textblob import TextBlob

# Correct a sentence
text = TextBlob("I havv a speling eror")
corrected = text.correct()
print(corrected) # I have a spelling error

# Get word suggestions for a single word
word = TextBlob("happpy").words[0]
print(word.spellcheck())
# [('happy', 1.0)] # List of (word, confidence score) pairs

Performance Comparison

import time
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
from spellchecker import SpellChecker

# Test word ("accommodation" with the last 'i' and 'o' swapped)
test_word = "accommodatoin"

# NLTK approach
def nltk_correct(misspelled, correct_words=None):
    """Return the dictionary word closest to ``misspelled`` by edit distance.

    Args:
        misspelled: The word to correct. It is lowercased before comparison.
        correct_words: Optional pre-loaded word list. When omitted, the NLTK
            ``words`` corpus is read on every call, which adds to the
            measured time; pass a list to time the search alone.

    Returns:
        The closest word that starts with the same letter (the earliest one
        in the list on ties), or ``misspelled`` unchanged when it is empty
        or no word shares its first letter.
    """
    # An empty string has no first letter to filter on.
    if not misspelled:
        return misspelled
    if correct_words is None:
        correct_words = words.words()

    # Lowercase once instead of once per candidate.
    target = misspelled.lower()
    candidates = [w for w in correct_words if w[:1] == target[0]]
    if not candidates:
        # min() would raise ValueError on an empty sequence.
        return misspelled
    return min(candidates, key=lambda w: edit_distance(target, w))

# pyspellchecker approach
spell = SpellChecker()

# Benchmark
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() can jump if the system clock is
# adjusted mid-run.
# The NLTK loop runs only 10 times because each call is slow.
start = time.perf_counter()
for _ in range(10):
    nltk_correct(test_word)
nltk_time = time.perf_counter() - start

start = time.perf_counter()
for _ in range(100):
    spell.correction(test_word)
pyspell_time = time.perf_counter() - start

# The totals cover different iteration counts; divide by 10 and 100
# respectively to compare per-call cost.
print(f"NLTK (10 iterations): {nltk_time:.3f}s")
print(f"pyspellchecker (100 iterations): {pyspell_time:.3f}s")

Method Comparison

| Method | Speed | Accuracy | Best For |
| --- | --- | --- | --- |
| NLTK edit distance | 🐢 Slow | Good | Learning algorithms |
| pyspellchecker | ⚡ Fast | Excellent | Production applications |
| TextBlob | ⚡ Fast | Good | Quick prototyping |

Summary

Use NLTK's edit distance when you want to understand how spell checking algorithms work or need fine-grained control over the matching process. For production applications, switch to pyspellchecker: it is faster, uses word frequency for better suggestions, and supports custom dictionaries out of the box.