How to Compare Two Files Line-by-Line in Python
Comparing text files is a common task in DevOps, testing, and configuration management. Python provides two main approaches: a manual loop for speed and custom logic, or difflib for generating human-readable diff reports.
This guide covers both methods along with handling edge cases like files of different lengths.
Consider the following two files, config_1.txt and config_2.txt, which are used in the examples throughout this guide:
# Sample configuration
host = 127.0.0.1
port = 8080
debug = True
max_connections = 100
log_level = INFO
# Sample configuration
host = 127.0.0.1
port = 9090
debug = True
max_connections = 100
log_level = DEBUG
The zip() Loop (Fast and Memory Efficient)
When you need to know if files differ or find the first difference, use zip() to stream both files simultaneously:
def compare_lines(file1, file2):
    """Compare two files line by line, stopping at the first difference.

    Both files are streamed in lockstep with ``zip()``, so the comparison
    ends when the shorter file is exhausted; any extra lines in the longer
    file are not examined.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        False as soon as a pair of lines differs (after printing the line
        number and both lines), True if every compared pair is equal.
    """
    with open(file1) as f1, open(file2) as f2:
        for line_num, (line1, line2) in enumerate(zip(f1, f2), start=1):
            if line1 != line2:
                # Remove only the line terminator: stripping all surrounding
                # whitespace would make an indentation-only difference print
                # as two identical lines.
                shown1 = line1.rstrip("\n")
                shown2 = line2.rstrip("\n")
                print(f"Difference at line {line_num}:")
                print(f" < {shown1}")
                print(f" > {shown2}")
                return False
    print("Files are identical.")
    return True
# Usage: compare the two sample configuration files. The call prints the
# first differing pair of lines, or a message that the files are identical.
compare_lines("config_1.txt", "config_2.txt")
Output:
Difference at line 3:
< port = 8080
> port = 9090
The zip() function stops when the shorter file ends. If one file has extra lines, they won't be compared. See the next section for handling different file lengths.
Handling Files of Different Lengths
Use itertools.zip_longest to compare files that may have different line counts:
from itertools import zip_longest
def compare_files_full(file1, file2):
    """Compare two files completely, including differences in length.

    The files are streamed in lockstep with ``zip_longest``; once the
    shorter file is exhausted its side is padded with ``None``.

    Args:
        file1: Path of the first (reference) file.
        file2: Path of the second file.

    Returns:
        A list of tuples, one per differing line number (1-based):
        ``(line_num, "ADDED", text)`` when only file2 has the line,
        ``(line_num, "REMOVED", text)`` when only file1 has the line, and
        ``(line_num, "CHANGED", text1, text2)`` when both have it but the
        content differs. The list is empty when the files are identical.
    """
    differences = []
    with open(file1) as f1, open(file2) as f2:
        for line_num, (line1, line2) in enumerate(
            zip_longest(f1, f2, fillvalue=None), start=1
        ):
            # Only the line terminator is removed from reported text, so a
            # whitespace-only change is still visible in the result.
            if line1 is None:
                differences.append((line_num, "ADDED", line2.rstrip("\n")))
            elif line2 is None:
                differences.append((line_num, "REMOVED", line1.rstrip("\n")))
            elif line1 != line2:
                differences.append(
                    (line_num, "CHANGED", line1.rstrip("\n"), line2.rstrip("\n"))
                )
    return differences
# Usage: collect every difference between the two sample files, then print
# each one in a layout that depends on its kind.
diffs = compare_files_full("config_1.txt", "config_2.txt")
if diffs:
    for number, kind, *texts in diffs:
        if kind == "CHANGED":
            # CHANGED entries carry both the old and the new text.
            before, after = texts
            print(f"Line {number}: Changed")
            print(f" - {before}")
            print(f" + {after}")
        elif kind == "ADDED":
            print(f"Line {number}: Added")
            print(f" + {texts[0]}")
        else:
            print(f"Line {number}: Removed")
            print(f" - {texts[0]}")
else:
    print("Files are identical")
Output:
Line 3: Changed
- port = 8080
+ port = 9090
Line 6: Changed
- log_level = INFO
+ log_level = DEBUG
Using difflib for Diff Reports
For Git-style diff output that's familiar to developers, use the standard library difflib:
import difflib
def print_unified_diff(file1, file2):
    """Print a unified diff of two files, similar to ``git diff`` output.

    Both files are read fully into memory before being compared.

    Args:
        file1: Path of the original file; also used as the "from" label.
        file2: Path of the modified file; also used as the "to" label.

    Returns:
        None. The diff, or "Files are identical" when there is nothing to
        report, is written to standard output.
    """
    with open(file1) as f1, open(file2) as f2:
        # Drop the line terminators up front. The diff is generated with
        # lineterm='' and joined with '\n' below, so keeping the original
        # newlines would print an empty line after every content line.
        lines1 = [line.rstrip("\n") for line in f1]
        lines2 = [line.rstrip("\n") for line in f2]
    diff = difflib.unified_diff(
        lines1,
        lines2,
        fromfile=file1,
        tofile=file2,
        lineterm='',
    )
    output = list(diff)
    if output:
        print('\n'.join(output))
    else:
        print("Files are identical")
# Usage: print a unified diff of the two sample configuration files.
print_unified_diff("config_1.txt", "config_2.txt")
Output:
--- config_1.txt
+++ config_2.txt
@@ -1,6 +1,6 @@
# Sample configuration
host = 127.0.0.1
-port = 8080
+port = 9090
debug = True
max_connections = 100
-log_level = INFO
+log_level = DEBUG
The difflib examples in this guide read both files entirely into memory before comparing them. For very large files, prefer the streaming zip() approach.
Side-by-Side Comparison
For a visual side-by-side view:
import difflib
def side_by_side_diff(file1, file2, output_path="diff_report.html"):
    """Write an HTML side-by-side comparison of two files.

    Both files are read fully into memory before being compared.

    Args:
        file1: Path of the first file; also used as the left column title.
        file2: Path of the second file; also used as the right column title.
        output_path: Where to write the HTML report. Defaults to
            "diff_report.html" in the current working directory.

    Returns:
        None. The report is written to ``output_path`` and a confirmation
        message is printed.
    """
    with open(file1) as f1, open(file2) as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()
    differ = difflib.HtmlDiff()
    html = differ.make_file(
        lines1,
        lines2,
        fromdesc=file1,
        todesc=file2,
    )
    # make_file() declares utf-8 as the document charset by default, so the
    # file must be written with that encoding rather than the platform
    # default for the declaration to be true.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(html)
    print(f"Diff report saved to {output_path}")
# For terminal output, use context_diff
def context_diff(file1, file2):
    """Print a context-format diff of two files to standard output.

    Args:
        file1: Path of the first file; also used as the "from" label.
        file2: Path of the second file; also used as the "to" label.

    Returns:
        None. Each changed region is printed with two lines of surrounding
        context; nothing is printed when the files are identical.
    """
    with open(file1) as left, open(file2) as right:
        left_lines = left.readlines()
        right_lines = right.readlines()
    report = difflib.context_diff(
        left_lines,
        right_lines,
        fromfile=file1,
        tofile=file2,
        n=2,  # Lines of context
    )
    # The generated lines already end with their own terminators, so they are
    # emitted without adding another newline.
    print("".join(report), end='')
# Usage: write the HTML report for the two sample files, then print a
# context diff of the same files to the terminal.
side_by_side_diff("config_1.txt", "config_2.txt")
context_diff("config_1.txt", "config_2.txt")
Output:
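Diff report saved to diff_report.html
*** config_1.txt
--- config_2.txt
***************
*** 1,6 ****
  # Sample configuration
  host = 127.0.0.1
! port = 8080
  debug = True
  max_connections = 100
! log_level = INFO
--- 1,6 ----
  # Sample configuration
  host = 127.0.0.1
! port = 9090
  debug = True
  max_connections = 100
! log_level = DEBUG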
Ignoring Whitespace Differences
Sometimes you want to compare content while ignoring formatting:
from itertools import zip_longest
def compare_ignore_whitespace(file1, file2):
    """Compare two files while ignoring leading and trailing whitespace.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        False after printing the first pair of lines whose stripped content
        differs, True (after printing a confirmation) when no pair differs.
    """
    with open(file1) as left, open(file2) as right:
        # Once one file runs out its side is padded with '', so a surplus
        # non-blank line in the other file is reported as a difference.
        pairs = zip_longest(left, right, fillvalue='')
        for number, pair in enumerate(pairs, start=1):
            ours, theirs = pair
            if ours.strip() == theirs.strip():
                continue
            print(f"First difference at line {number}:")
            print(f" < {ours.rstrip()}")
            print(f" > {theirs.rstrip()}")
            return False
    print("Files are identical (ignoring leading/trailing whitespace).")
    return True
def compare_ignore_blank_lines(file1, file2):
    """Compare two files while ignoring blank lines entirely.

    Blank (whitespace-only) lines are dropped and trailing whitespace is
    removed from the remaining lines before the comparison.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        True (after printing a confirmation) when the remaining lines are
        equal; otherwise False, after printing the first differing pair.
    """
    with open(file1) as f1, open(file2) as f2:
        lines1 = [line.rstrip() for line in f1 if line.strip()]
        lines2 = [line.rstrip() for line in f2 if line.strip()]
    if lines1 == lines2:
        print("Files are identical (ignoring blank lines).")
        return True
    # Show first difference
    for i, (line1, line2) in enumerate(
        zip_longest(lines1, lines2, fillvalue=''), start=1
    ):
        if line1 != line2:
            # i counts non-blank lines only, so it is reported as a logical
            # line number rather than a position in either file.
            print(f"First Difference at logical line {i}:")
            print(f" < {line1}")
            print(f" > {line2}")
            break
    return False
# Usage: run both whitespace-tolerant comparisons on the sample files.
compare_ignore_whitespace("config_1.txt", "config_2.txt")
print()  # blank line separating the two reports
compare_ignore_blank_lines("config_1.txt", "config_2.txt")
Output:
First difference at line 3:
< port = 8080
> port = 9090
First Difference at logical line 3:
< port = 8080
> port = 9090
Finding Similar Lines with Ratios
Use SequenceMatcher to find how similar two files are:
import difflib
def similarity_ratio(file1, file2):
    """Return how similar two files are, as a percentage.

    Both files are read completely and compared as single strings.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        The SequenceMatcher ratio of the two file contents scaled to the
        range 0-100.
    """
    with open(file1) as first, open(file2) as second:
        matcher = difflib.SequenceMatcher(None, first.read(), second.read())
    return matcher.ratio() * 100
# Usage: report the similarity of the sample files as a percentage with one
# decimal place.
similarity = similarity_ratio("config_1.txt", "config_2.txt")
print(f"Files are {similarity:.1f}% similar")
Output:
Files are 93.8% similar
Practical Example: Config File Comparison
A complete example for comparing configuration files:
import difflib
from pathlib import Path
def compare_configs(file1, file2, ignore_comments=True):
    """Compare two config files, optionally leaving out comment lines.

    Args:
        file1: Path of the first config file; used as the "from" label.
        file2: Path of the second config file; used as the "to" label.
        ignore_comments: When true, lines whose first non-whitespace
            character is '#' are excluded from the comparison.

    Returns:
        True (after printing a confirmation) when the compared lines are
        equal; otherwise False, after printing a unified diff.
    """

    def _load(path):
        # Read one file, dropping comment lines when filtering is enabled.
        with open(path) as handle:
            if ignore_comments:
                return [
                    row for row in handle
                    if not row.strip().startswith('#')
                ]
            return handle.readlines()

    changes = list(
        difflib.unified_diff(
            _load(file1),
            _load(file2),
            fromfile=str(file1),
            tofile=str(file2),
        )
    )
    if not changes:
        print("Configurations are equivalent")
        return True
    print("Configuration differences found:")
    print(''.join(changes))
    return False
# Usage: compare the sample files with comment lines filtered out
# (ignore_comments defaults to True).
compare_configs("config_1.txt", "config_2.txt")
Output:
Configuration differences found:
--- config_1.txt
+++ config_2.txt
@@ -1,5 +1,5 @@
host = 127.0.0.1
-port = 8080
+port = 9090
debug = True
max_connections = 100
-log_level = INFO
+log_level = DEBUG
Summary
| Method | Memory Usage | Output Type | Best For |
|---|---|---|---|
| zip() loop | Low (streaming) | Boolean / Custom | Quick checks, large files |
| zip_longest() | Low (streaming) | Custom | Files of different lengths |
| difflib.unified_diff | High (loads file) | Standard diff | Developer-friendly reports |
| difflib.HtmlDiff | High | HTML report | Visual comparison |
For simple checks ("Are these configs identical?"), use a zip() loop, or zip_longest() when the files may differ in length, for memory efficiency and full control. For displaying changes to users or generating reports, use difflib.unified_diff, which produces familiar Git-style output.