How to Compare YAML Files and Keys in Python
YAML is the standard format for configuration in Kubernetes, Docker Compose, Ansible, and many CI/CD systems. Comparing YAML files requires understanding nested structures, not just line-by-line text differences.
This guide covers deep comparison techniques, key validation, and security best practices for YAML handling.
Deep Comparison with DeepDiff
For complex nested YAML files, DeepDiff provides detailed change reports showing exactly what changed, where it changed, and how.
pip install deepdiff pyyaml
Consider the following files:
# config_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true

# config_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true
This is the script for finding exactly what changed, where it changed, and how:
import yaml
from deepdiff import DeepDiff
def compare_yaml_files(file1, file2):
    """Parse two YAML files and diff the resulting data structures.

    Args:
        file1: Path of the baseline YAML file.
        file2: Path of the YAML file to compare against the baseline.

    Returns:
        The DeepDiff result for the two parsed documents. It is empty
        (falsy) when no differences were found.
    """
    with open(file1) as baseline_fh, open(file2) as candidate_fh:
        baseline = yaml.safe_load(baseline_fh)
        candidate = yaml.safe_load(candidate_fh)

    # List order is deliberately not significant: [a, b] equals [b, a].
    return DeepDiff(baseline, candidate, ignore_order=True)
# Usage: diff the two example configs and list every change, grouped by kind
diff = compare_yaml_files('config_v1.yaml', 'config_v2.yaml')

if diff:
    print("Differences found:")
    for change_type, changes in diff.items():
        print(f"\n{change_type}:")
        # Iterating a change group yields the path of each affected item
        for change in changes:
            print(f" {change}")
else:
    print("Files are identical")
Example Output:
Differences found:
dictionary_item_added:
root['cache']
dictionary_item_removed:
root['legacy']
values_changed:
root['database']['port']
root['api']['timeout']
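Never use yaml.load() on untrusted input without specifying a safe loader. Unsafe loaders (yaml.Loader, yaml.UnsafeLoader, and the implicit default in PyYAML versions before 5.1) can construct arbitrary Python objects and execute code embedded in YAML files, creating a severe security vulnerability. PyYAML 6.0 and later require an explicit Loader argument, so the bare call shown below raises a TypeError there.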
# ⚠️ DANGEROUS - Never do this
data = yaml.load(file)
# ✅ SAFE - Always use safe_load
data = yaml.safe_load(file)
Interpreting DeepDiff Results
Consider the following files:
# config_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true

# config_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true
This is the script:
import yaml
from deepdiff import DeepDiff
def analyze_yaml_changes(file1, file2):
    """Diff two YAML files and sort the changes into categories.

    Args:
        file1: Path of the baseline YAML file.
        file2: Path of the YAML file to compare against the baseline.

    Returns:
        A dict with four lists:
        'added' / 'removed' hold the DeepDiff paths of dictionary items that
        exist in only one of the files; 'modified' holds one
        {'path', 'old', 'new'} dict per changed value; 'type_changes' holds
        one {'path', 'old_type', 'new_type'} dict per value whose type
        changed (types are given by name, e.g. 'str' or 'int').
    """
    with open(file1) as old_fh, open(file2) as new_fh:
        old_data = yaml.safe_load(old_fh)
        new_data = yaml.safe_load(new_fh)

    delta = DeepDiff(old_data, new_data, ignore_order=True)

    # Value changes, each with its before/after pair
    modified = [
        {
            'path': path,
            'old': change['old_value'],
            'new': change['new_value'],
        }
        for path, change in delta.get('values_changed', {}).items()
    ]

    # Type changes (e.g., string to integer), reported by type name
    type_changes = [
        {
            'path': path,
            'old_type': type(change['old_value']).__name__,
            'new_type': type(change['new_value']).__name__,
        }
        for path, change in delta.get('type_changes', {}).items()
    ]

    return {
        'added': list(delta.get('dictionary_item_added', [])),
        'removed': list(delta.get('dictionary_item_removed', [])),
        'modified': modified,
        'type_changes': type_changes,
    }
# Usage: count the added, removed, and modified entries between the two
# example configs. The report's 'type_changes' list is not printed here.
report = analyze_yaml_changes('config_v1.yaml', 'config_v2.yaml')
print(f"Added keys: {len(report['added'])}")
print(f"Removed keys: {len(report['removed'])}")
print(f"Modified values: {len(report['modified'])}")
Output:
Added keys: 1
Removed keys: 1
Modified values: 2
Quick Key Validation with Sets
Consider the following file:
# deployment.yaml
apiVersion: v1
kind: Pod
metadata:
  name: example
docker:
  containers: []
This is the script for simple validation of required top-level keys:
import yaml
def validate_required_keys(filepath, required_keys):
    """Check that a YAML file's top-level mapping contains all required keys.

    Args:
        filepath: Path of the YAML file to validate.
        required_keys: Iterable of key names that must be present at the
            top level (a set, list, or tuple all work).

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is False when the file
        is empty, when its top-level node is not a mapping, or when any
        required key is missing; ``message`` explains the outcome.
    """
    # Binary mode lets PyYAML detect the encoding itself instead of relying
    # on the platform's locale default (which is not UTF-8 everywhere).
    with open(filepath, 'rb') as f:
        data = yaml.safe_load(f)

    if data is None:
        return False, "Empty YAML file"

    # A top-level list or scalar has no keys to validate.
    if not isinstance(data, dict):
        return False, (
            f"Top-level YAML node must be a mapping, got {type(data).__name__}"
        )

    # Normalize so callers may pass any iterable, not only a set.
    required = set(required_keys)
    actual_keys = set(data)

    missing = required - actual_keys
    extra = actual_keys - required

    if missing:
        return False, f"Missing required keys: {missing}"
    return True, f"All required keys present. Extra keys: {extra}"
# Usage: require the four keys below at the top level of the file and print
# the validation message (the boolean result is unpacked but not used).
required = {'apiVersion', 'kind', 'metadata', 'spec'}
valid, message = validate_required_keys('deployment.yaml', required)
print(message)
Output:
Missing required keys: {'spec'}
Nested Key Validation
Consider the following files:
# schema_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true

# schema_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true
This is the script:
import yaml
def get_nested_keys(data, prefix=''):
    """Recursively collect the path of every mapping key inside *data*.

    Nested mapping keys are joined with dots (``'a.b'``); list elements
    contribute an index segment (``'items[0].name'``). Only mapping keys
    produce entries, so scalars and bare list positions add nothing.

    Args:
        data: Parsed YAML content (dict, list, scalar, or None).
        prefix: Path of the node that contains *data*; empty for the root.

    Returns:
        A set of path strings. Every path is a ``str``, including paths for
        non-string keys such as integers or booleans.
    """
    keys = set()
    if isinstance(data, dict):
        for key, value in data.items():
            # str(key) keeps root-level non-string keys consistent with
            # nested ones and, because the result is passed down as the next
            # prefix, stops falsy keys (0, False) from dropping the parent
            # segment of their children's paths.
            full_path = f"{prefix}.{key}" if prefix else str(key)
            keys.add(full_path)
            keys.update(get_nested_keys(value, full_path))
    elif isinstance(data, list):
        for i, item in enumerate(data):
            keys.update(get_nested_keys(item, f"{prefix}[{i}]"))
    return keys
def compare_yaml_keys(file1, file2):
    """Compare which key paths exist in two YAML files, ignoring values.

    Args:
        file1: Path of the first YAML file.
        file2: Path of the second YAML file.

    Returns:
        A dict of three sets of key paths: 'only_in_first',
        'only_in_second', and 'common'.
    """
    with open(file1) as first_fh, open(file2) as second_fh:
        first_doc = yaml.safe_load(first_fh)
        second_doc = yaml.safe_load(second_fh)

    first_paths = get_nested_keys(first_doc)
    second_paths = get_nested_keys(second_doc)

    comparison = {}
    comparison['only_in_first'] = first_paths.difference(second_paths)
    comparison['only_in_second'] = second_paths.difference(first_paths)
    comparison['common'] = first_paths.intersection(second_paths)
    return comparison
# Usage: key paths found only in the second file are reported as added,
# and those found only in the first file as removed.
result = compare_yaml_keys('schema_v1.yaml', 'schema_v2.yaml')
print(f"Keys added: {result['only_in_second']}")
print(f"Keys removed: {result['only_in_first']}")
Output:
Keys added: {'cache', 'cache.enabled'}
Keys removed: {'legacy', 'legacy.old_setting'}
Comparing Multi-Document YAML Files
YAML files can contain multiple documents separated by ---.
Consider the following files:
# multi_v1.yaml
---
kind: Service
metadata:
  name: svc1
---
kind: Deployment
metadata:
  name: app1

# multi_v2.yaml
---
kind: Service
metadata:
  name: svc1
---
kind: Deployment
metadata:
  name: app2
This is the script:
import yaml
from deepdiff import DeepDiff
def compare_multi_doc_yaml(file1, file2):
    """Compare two YAML files that may each contain several documents.

    Documents are paired by position (first with first, and so on) and each
    pair is diffed independently.

    Args:
        file1: Path of the baseline YAML file.
        file2: Path of the YAML file to compare against the baseline.

    Returns:
        A dict that always contains 'differences': a list of
        {'document_index', 'changes'} entries, one per document pair that
        differs. When the files hold a different number of documents the
        dict also contains 'document_count_changed' (True), 'count_file1'
        and 'count_file2'; in that case only the overlapping documents are
        compared.
    """
    # Binary mode lets PyYAML detect the encoding itself instead of relying
    # on the platform's locale default.
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        # safe_load_all is lazy, so materialize while the files are open.
        docs1 = list(yaml.safe_load_all(f1))
        docs2 = list(yaml.safe_load_all(f2))

    differences = []
    # zip stops at the shorter list, so this is safe on a count mismatch.
    for i, (doc1, doc2) in enumerate(zip(docs1, docs2)):
        diff = DeepDiff(doc1, doc2, ignore_order=True)
        if diff:
            differences.append({
                'document_index': i,
                'changes': diff,
            })

    if len(docs1) != len(docs2):
        # Keep the original mismatch keys and add 'differences' so callers
        # can rely on one result shape and no per-document change is lost.
        return {
            'document_count_changed': True,
            'count_file1': len(docs1),
            'count_file2': len(docs2),
            'differences': differences,
        }
    return {'differences': differences}
# Usage: compare the two multi-document example files and print the outcome
result = compare_multi_doc_yaml('multi_v1.yaml', 'multi_v2.yaml')

# A document-count mismatch is flagged with 'document_count_changed'.
# Using .get() keeps this working whether or not a given key is present.
if result.get('document_count_changed'):
    print(
        f"Document count changed: "
        f"{result['count_file1']} -> {result['count_file2']}"
    )
for entry in result.get('differences', []):
    print(f"Document {entry['document_index']}: {entry['changes']}")
Ignoring Specific Keys
Sometimes certain keys (like timestamps or generated IDs) should be excluded:
Consider the following files:
# deploy_v1.yaml
metadata:
  uid: abc123
  resourceVersion: "1"
  timestamp: "2025-01-01T10:00:00Z"
spec:
  replicas: 2

# deploy_v2.yaml
metadata:
  uid: xyz999
  resourceVersion: "2"
  timestamp: "2025-02-01T10:00:00Z"
spec:
  replicas: 3
This is the script:
import yaml
from deepdiff import DeepDiff
def compare_yaml_filtered(file1, file2, exclude_paths=None,
                          exclude_regex_paths=None):
    """Compare two YAML files while ignoring selected paths.

    Args:
        file1: Path of the baseline YAML file.
        file2: Path of the YAML file to compare against the baseline.
        exclude_paths: Exact DeepDiff paths to ignore, e.g.
            ``"root['metadata']['uid']"``. Defaults to no exact exclusions.
        exclude_regex_paths: Regular expressions matched against DeepDiff
            paths; matching paths are ignored. When omitted (None), keys
            under ``metadata`` whose name contains ``timestamp`` are
            ignored. Pass an empty list to disable regex exclusion.

    Returns:
        The DeepDiff result for the two parsed documents with the excluded
        paths left out.
    """
    exclude_paths = exclude_paths or []
    if exclude_regex_paths is None:
        # Default kept from the original hard-coded behavior.
        exclude_regex_paths = [r"root\['metadata'\]\['.*timestamp.*'\]"]

    # Binary mode lets PyYAML detect the encoding itself instead of relying
    # on the platform's locale default.
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    # DeepDiff accepts both exact paths and regex patterns for exclusion
    return DeepDiff(
        data1, data2,
        ignore_order=True,
        exclude_paths=exclude_paths,
        exclude_regex_paths=exclude_regex_paths,
    )
# Usage: ignore the generated identifiers so only meaningful changes remain
ignored_paths = ["root['metadata']['uid']", "root['metadata']['resourceVersion']"]
diff = compare_yaml_filtered('deploy_v1.yaml', 'deploy_v2.yaml', exclude_paths=ignored_paths)
print(diff)
Generating Human-Readable Reports
Consider the following files:
# config_1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true

# config_2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true
The following is an example of a function that turns the raw diff into a clear, human-readable report:
import yaml
from deepdiff import DeepDiff
def generate_yaml_diff_report(file1, file2):
    """Generate a formatted, human-readable diff report for two YAML files.

    Args:
        file1: Path of the baseline YAML file.
        file2: Path of the YAML file to compare against the baseline.

    Returns:
        A multi-line string. It starts with a header naming both files,
        followed either by "No differences found." or by one section per
        kind of change (added, removed, modified, type changed, and any
        other change type DeepDiff reports).
    """
    # Binary mode lets PyYAML detect the encoding itself instead of relying
    # on the platform's locale default.
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    diff = DeepDiff(data1, data2, ignore_order=True, verbose_level=2)

    lines = [f"Comparing {file1} -> {file2}", "=" * 50]
    if not diff:
        lines.append("No differences found.")
        return "\n".join(lines)

    if 'dictionary_item_added' in diff:
        lines.append("\n🟢 ADDED:")
        lines.extend(f" + {item}" for item in diff['dictionary_item_added'])

    if 'dictionary_item_removed' in diff:
        lines.append("\n🔴 REMOVED:")
        lines.extend(f" - {item}" for item in diff['dictionary_item_removed'])

    if 'values_changed' in diff:
        lines.append("\n🟡 MODIFIED:")
        for path, change in diff['values_changed'].items():
            lines.append(f" {path}:")
            lines.append(f" old: {change['old_value']}")
            lines.append(f" new: {change['new_value']}")

    # A value whose type changed (e.g. "5432" -> 5432) is reported by
    # DeepDiff separately from values_changed; show the type names so the
    # difference is visible even when the printed values look alike.
    if 'type_changes' in diff:
        lines.append("\n🟠 TYPE CHANGED:")
        for path, change in diff['type_changes'].items():
            old_value, new_value = change['old_value'], change['new_value']
            lines.append(f" {path}:")
            lines.append(f" old: {old_value!r} ({type(old_value).__name__})")
            lines.append(f" new: {new_value!r} ({type(new_value).__name__})")

    # Anything else (for example list items added or removed) is listed by
    # path so that a non-empty diff never yields an empty report body.
    handled = {
        'dictionary_item_added',
        'dictionary_item_removed',
        'values_changed',
        'type_changes',
    }
    for change_type in diff:
        if change_type in handled:
            continue
        lines.append(f"\n⚪ {change_type.upper()}:")
        lines.extend(f" * {item}" for item in diff[change_type])

    return "\n".join(lines)
# Usage: build the formatted report for the two example configs and print it
report = generate_yaml_diff_report('config_1.yaml', 'config_2.yaml')
print(report)
Output:
Comparing config_1.yaml -> config_2.yaml
==================================================
🟢 ADDED:
+ root['cache']
🔴 REMOVED:
- root['legacy']
🟡 MODIFIED:
root['database']['port']:
old: 5432
new: 5433
root['api']['timeout']:
old: 30
new: 60
Summary
| Scenario | Tool | Features |
|---|---|---|
| Full comparison | DeepDiff | Detects value changes, additions, removals in nested structures |
| Key validation | set() operations | Fast check for required/missing keys |
| Structure comparison | Recursive key extraction | Compare schemas without values |
| Multi-document | yaml.safe_load_all() | Handle --- separated documents |
Use DeepDiff with ignore_order=True for comparing configuration files. It provides actionable details like "Port changed from 80 to 8080" rather than just "files differ." For simple key presence checks, set operations are faster and require no additional dependencies.