Skip to main content

How to Compare YAML Files and Keys in Python

YAML is the standard format for configuration in Kubernetes, Docker Compose, Ansible, and many CI/CD systems. Comparing YAML files requires understanding nested structures, not just line-by-line text differences.

This guide covers deep comparison techniques, key validation, and security best practices for YAML handling.

Deep Comparison with DeepDiff​

For complex nested YAML files, DeepDiff provides detailed change reports showing exactly what changed, where it changed, and how.

pip install deepdiff pyyaml

Consider the following files:

config_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true
config_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true

This is the script for finding exactly what changed, where it changed, and how:

import yaml
from deepdiff import DeepDiff

def compare_yaml_files(file1, file2):
    """Compare two YAML files and return detailed differences.

    Args:
        file1: Path to the baseline YAML file.
        file2: Path to the YAML file compared against the baseline.

    Returns:
        The ``DeepDiff`` result for the two parsed documents. It is empty
        (falsy) when no differences were found.
    """
    # Be explicit about the encoding instead of relying on the platform
    # default, which is not UTF-8 everywhere.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    # ignore_order=True treats [a, b] as equal to [b, a]
    diff = DeepDiff(data1, data2, ignore_order=True)

    return diff

# Usage
diff = compare_yaml_files('config_v1.yaml', 'config_v2.yaml')

if not diff:
    print("Files are identical")
else:
    print("Differences found:")
    # Each top-level entry of the result is one change category
    # (e.g. values_changed); iterating its value yields the affected paths.
    for change_type, changes in diff.items():
        print(f"\n{change_type}:")
        for change in changes:
            print(f" {change}")

Example Output:

Differences found:

dictionary_item_added:
root['cache']

dictionary_item_removed:
root['legacy']

values_changed:
root['database']['port']
root['api']['timeout']
Always Use safe_load

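Never use yaml.load() with an unsafe loader on untrusted input. Loaders such as yaml.Loader / yaml.UnsafeLoader can construct arbitrary Python objects, and therefore execute arbitrary code, from tags embedded in a YAML file, creating a severe security vulnerability. (PyYAML 6.0+ refuses to run yaml.load() without an explicit Loader argument; older versions fell back to a default loader, which before 5.1 was the unsafe one.)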

# ⛔️ DANGEROUS - Never do this
data = yaml.load(file)

# ✅ SAFE - Always use safe_load
data = yaml.safe_load(file)

Interpreting DeepDiff Results​

Consider the following files:

config_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true
config_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true

This is the script:

import yaml
from deepdiff import DeepDiff

def analyze_yaml_changes(file1, file2):
    """Analyze and categorize the changes between two YAML files.

    Args:
        file1: Path to the baseline YAML file.
        file2: Path to the YAML file compared against the baseline.

    Returns:
        A dict with four lists:

        * ``added`` / ``removed``: entries reported under
          ``dictionary_item_added`` / ``dictionary_item_removed``.
        * ``modified``: one ``{'path', 'old', 'new'}`` dict per changed value.
        * ``type_changes``: one ``{'path', 'old_type', 'new_type'}`` dict per
          value whose type changed (type names as strings).
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    diff = DeepDiff(data1, data2, ignore_order=True)

    report = {
        'added': list(diff.get('dictionary_item_added', [])),
        'removed': list(diff.get('dictionary_item_removed', [])),
        'modified': [],
        'type_changes': [],
    }

    # Extract value changes with before/after
    for path, change in diff.get('values_changed', {}).items():
        report['modified'].append({
            'path': path,
            'old': change['old_value'],
            'new': change['new_value'],
        })

    # Type changes (e.g., string to integer)
    for path, change in diff.get('type_changes', {}).items():
        report['type_changes'].append({
            'path': path,
            'old_type': type(change['old_value']).__name__,
            'new_type': type(change['new_value']).__name__,
        })

    return report

# Usage: print how many entries landed in each category of the report.
report = analyze_yaml_changes(file1='config_v1.yaml', file2='config_v2.yaml')
print(
    f"Added keys: {len(report['added'])}",
    f"Removed keys: {len(report['removed'])}",
    f"Modified values: {len(report['modified'])}",
    sep="\n",
)

Output:

Added keys: 1
Removed keys: 1
Modified values: 2

Quick Key Validation with Sets​

Consider the following file:

deployment.yaml
apiVersion: v1
kind: Pod
metadata:
  name: example
docker:
  containers: []

This is the script for simple validation of required top-level keys:

import yaml

def validate_required_keys(filepath, required_keys):
    """Check whether a YAML file contains all required top-level keys.

    Args:
        filepath: Path to the YAML file to validate.
        required_keys: Iterable of key names that must exist at the top level.

    Returns:
        A ``(valid, message)`` tuple. ``valid`` is False when the file is
        empty, when its root is not a mapping, or when keys are missing;
        ``message`` explains the outcome.
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(filepath, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if data is None:
        return False, "Empty YAML file"

    # A document whose root is a list or scalar has no keys to validate.
    if not isinstance(data, dict):
        return False, f"Top-level YAML value must be a mapping, got {type(data).__name__}"

    # Accept any iterable (list, tuple, ...) so the set arithmetic always works.
    required = set(required_keys)
    actual_keys = set(data)
    missing = required - actual_keys
    extra = actual_keys - required

    if missing:
        return False, f"Missing required keys: {missing}"

    return True, f"All required keys present. Extra keys: {extra}"

# Usage: validate deployment.yaml against the four keys listed below and
# print the resulting message.
required = {'apiVersion', 'kind', 'metadata', 'spec'}
valid, message = validate_required_keys(
    filepath='deployment.yaml',
    required_keys=required,
)
print(message)

Output:

Missing required keys: {'spec'}

Nested Key Validation​

Consider the following files:

schema_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true
schema_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true

This is the script:

import yaml

def get_nested_keys(data, prefix=''):
    """Recursively collect every key path in a nested dict/list structure.

    Dict keys are joined with dots (``database.port``). List items contribute
    an index suffix to the paths beneath them (``servers[0].host``) but no
    entry of their own.

    Args:
        data: Parsed YAML value (dict, list, or scalar).
        prefix: Path of ``data`` inside the enclosing document; empty at the root.

    Returns:
        A set of path strings. Scalars and ``None`` yield an empty set.
    """
    keys = set()

    if isinstance(data, dict):
        for key, value in data.items():
            # YAML permits non-string keys (ints, bools, ...). Stringify them so
            # every path in the result is a str at any nesting level.
            full_path = f"{prefix}.{key}" if prefix else str(key)
            keys.add(full_path)
            keys.update(get_nested_keys(value, full_path))
    elif isinstance(data, list):
        for i, item in enumerate(data):
            keys.update(get_nested_keys(item, f"{prefix}[{i}]"))

    return keys

def compare_yaml_keys(file1, file2):
    """Compare the key structures of two YAML files, ignoring values.

    Args:
        file1: Path to the first YAML file.
        file2: Path to the second YAML file.

    Returns:
        A dict of three sets of key paths: ``only_in_first``,
        ``only_in_second`` and ``common``.
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    keys1 = get_nested_keys(data1)
    keys2 = get_nested_keys(data2)

    return {
        'only_in_first': keys1 - keys2,
        'only_in_second': keys2 - keys1,
        'common': keys1 & keys2,
    }

# Usage: keys present only in the second file count as added,
# keys present only in the first file count as removed.
result = compare_yaml_keys(file1='schema_v1.yaml', file2='schema_v2.yaml')
added, removed = result['only_in_second'], result['only_in_first']
print(f"Keys added: {added}")
print(f"Keys removed: {removed}")

Output:

Keys added: {'cache', 'cache.enabled'}
Keys removed: {'legacy', 'legacy.old_setting'}

Comparing Multi-Document YAML Files​

YAML files can contain multiple documents separated by ---.

Consider the following files:

multi_v1.yaml
---
kind: Service
metadata:
  name: svc1
---
kind: Deployment
metadata:
  name: app1
multi_v2.yaml
---
kind: Service
metadata:
  name: svc1
---
kind: Deployment
metadata:
  name: app2

This is the script:

import yaml
from deepdiff import DeepDiff

def compare_multi_doc_yaml(file1, file2):
    """Compare YAML files that contain multiple ``---`` separated documents.

    Documents are compared pairwise by position.

    Args:
        file1: Path to the first multi-document YAML file.
        file2: Path to the second multi-document YAML file.

    Returns:
        If the files hold a different number of documents, a dict with
        ``document_count_changed``, ``count_file1`` and ``count_file2``
        (no per-document comparison is attempted). Otherwise
        ``{'differences': [...]}`` with one ``{'document_index', 'changes'}``
        entry per document pair that differs.
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        # safe_load_all is lazy; materialize while the files are still open.
        docs1 = list(yaml.safe_load_all(f1))
        docs2 = list(yaml.safe_load_all(f2))

    if len(docs1) != len(docs2):
        return {
            'document_count_changed': True,
            'count_file1': len(docs1),
            'count_file2': len(docs2),
        }

    differences = []
    for i, (doc1, doc2) in enumerate(zip(docs1, docs2)):
        diff = DeepDiff(doc1, doc2, ignore_order=True)
        if diff:
            differences.append({
                'document_index': i,
                'changes': diff,
            })

    return {'differences': differences}

# Usage: `result` is either the document-count mismatch summary or a
# {'differences': [...]} dict listing each changed document by index.
result = compare_multi_doc_yaml(file1='multi_v1.yaml', file2='multi_v2.yaml')

Ignoring Specific Keys​

Sometimes certain keys (like timestamps or generated IDs) should be excluded:

Consider the following files:

deploy_v1.yaml
metadata:
  uid: abc123
  resourceVersion: "1"
  timestamp: "2025-01-01T10:00:00Z"
spec:
  replicas: 2
deploy_v2.yaml
metadata:
  uid: xyz999
  resourceVersion: "2"
  timestamp: "2025-02-01T10:00:00Z"
spec:
  replicas: 3

This is the script:

import yaml
from deepdiff import DeepDiff

def compare_yaml_filtered(file1, file2, exclude_paths=None, exclude_regex_paths=None):
    """Compare YAML files while ignoring specified paths.

    Args:
        file1: Path to the baseline YAML file.
        file2: Path to the YAML file compared against the baseline.
        exclude_paths: Exact DeepDiff paths to ignore, e.g.
            ``"root['metadata']['uid']"``. Defaults to none.
        exclude_regex_paths: Regex patterns matched against DeepDiff paths.
            When omitted, any key under ``metadata`` whose name contains
            ``timestamp`` is ignored (the behaviour that used to be
            hard-coded).

    Returns:
        The ``DeepDiff`` result with the excluded paths left out.
    """
    exclude_paths = exclude_paths or []
    if exclude_regex_paths is None:
        # Backward-compatible default: skip metadata timestamps.
        exclude_regex_paths = [r"root\['metadata'\]\['.*timestamp.*'\]"]

    # Be explicit about the encoding instead of relying on the platform default.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    # DeepDiff accepts regex patterns for exclusion
    diff = DeepDiff(
        data1, data2,
        ignore_order=True,
        exclude_paths=exclude_paths,
        exclude_regex_paths=exclude_regex_paths,
    )

    return diff

# Usage: uid and resourceVersion differ between the two files, but both
# paths are excluded from the comparison.
diff = compare_yaml_filtered(
    file1='deploy_v1.yaml',
    file2='deploy_v2.yaml',
    exclude_paths=[
        "root['metadata']['uid']",
        "root['metadata']['resourceVersion']",
    ],
)
print(diff)

Generating Human-Readable Reports​

Consider the following files:

config_v1.yaml
database:
  port: 5432
api:
  timeout: 30
legacy:
  old_setting: true
config_v2.yaml
database:
  port: 5433
api:
  timeout: 60
cache:
  enabled: true

The following function turns the raw diff into a clear, human-readable report:

import yaml
from deepdiff import DeepDiff

def generate_yaml_diff_report(file1, file2):
    """Generate a formatted, human-readable diff report for two YAML files.

    Args:
        file1: Path to the baseline YAML file.
        file2: Path to the YAML file compared against the baseline.

    Returns:
        A multi-line string: a header naming both files, then ADDED, REMOVED
        and MODIFIED sections for the categories that have entries, or
        "No differences found." when the documents are equal.
    """
    # Be explicit about the encoding instead of relying on the platform default.
    with open(file1, encoding="utf-8") as f1, open(file2, encoding="utf-8") as f2:
        data1 = yaml.safe_load(f1)
        data2 = yaml.safe_load(f2)

    diff = DeepDiff(data1, data2, ignore_order=True, verbose_level=2)

    lines = [f"Comparing {file1} -> {file2}", "=" * 50]

    if not diff:
        lines.append("No differences found.")
        return "\n".join(lines)

    if 'dictionary_item_added' in diff:
        lines.append("\n🟢 ADDED:")
        for item in diff['dictionary_item_added']:
            lines.append(f" + {item}")

    if 'dictionary_item_removed' in diff:
        lines.append("\n🔴 REMOVED:")
        for item in diff['dictionary_item_removed']:
            lines.append(f" - {item}")

    if 'values_changed' in diff:
        lines.append("\n🟡 MODIFIED:")
        for path, change in diff['values_changed'].items():
            lines.append(f" {path}:")
            lines.append(f" old: {change['old_value']}")
            lines.append(f" new: {change['new_value']}")

    return "\n".join(lines)

# Usage: the file names must match the sample files config_v1.yaml / config_v2.yaml.
report = generate_yaml_diff_report('config_v1.yaml', 'config_v2.yaml')
print(report)

Output:

🟢 ADDED:
+ root['cache']

🔴 REMOVED:
- root['legacy']

🟡 MODIFIED:
root['database']['port']:
old: 5432
new: 5433
root['api']['timeout']:
old: 30
new: 60

Summary​

| Scenario | Tool | Features |
| --- | --- | --- |
| Full comparison | DeepDiff | Detects value changes, additions, removals in nested structures |
| Key validation | set() operations | Fast check for required/missing keys |
| Structure comparison | Recursive key extraction | Compare schemas without values |
| Multi-document | yaml.safe_load_all() | Handle --- separated documents |
Best Practice

Use DeepDiff with ignore_order=True for comparing configuration files. It provides actionable details like "Port changed from 80 to 8080" rather than just "files differ." For simple key presence checks, set operations are faster and require no additional dependencies.