How to Compare Two XML Files in Python
Comparing XML documents requires special handling because structurally identical XML can have different text representations. Whitespace variations, attribute ordering, and formatting differences are all semantically meaningless but break simple text comparison.
This guide covers proper XML comparison techniques using both third-party libraries and Python's standard library.
Why Text Comparison Fails for XML
Standard string or line-by-line comparison produces false positives with XML:
# These are semantically IDENTICAL: same element, same attributes,
# only the order in which the attributes are written differs.
xml1 = '<user id="1" name="Alice"/>'
xml2 = '<user name="Alice" id="1"/>'
# But plain string comparison looks at characters, not XML structure,
# so it reports a difference
print(xml1 == xml2) # False (WRONG answer for XML semantics)
Never use open(f).read() == open(f2).read() or difflib on raw XML files. These approaches fail because:
- Whitespace: Pretty-printed and minified XML are functionally equivalent
- Attribute order: x="1" y="2" equals y="2" x="1" in XML semantics
- Quote styles: Single and double quotes are interchangeable
Config Files for the examples in the following sections
<?xml version="1.0" encoding="UTF-8"?>
<config>
<database>
<host>localhost</host>
<port>5432</port>
<username>admin</username>
</database>
<server>
<host>127.0.0.1</host>
<port>8080</port>
</server>
<logging level="INFO"/>
</config>
<?xml version="1.0" encoding="UTF-8"?>
<config>
<database>
<host>localhost</host>
<port>5433</port>
<username>admin</username>
</database>
<server>
<host>127.0.0.1</host>
<port>9090</port>
</server>
<logging level="DEBUG"/>
<featureFlags>
<feature name="betaMode" enabled="true"/>
</featureFlags>
</config>
Semantic Comparison with xmldiff
The xmldiff library understands XML structure and reports meaningful differences:
pip install xmldiff
from xmldiff import main, formatting
def compare_xml_semantic(file1, file2):
    """Report the structural differences between two XML files.

    Runs ``xmldiff.main.diff_files`` on the two paths and prints either a
    "semantically identical" message or the number of edit actions
    followed by one line per action.

    Returns:
        True when xmldiff reports no edit actions, False otherwise.
    """
    edit_script = main.diff_files(file1, file2)
    if edit_script:
        print(f"Found {len(edit_script)} differences:")
        for action in edit_script:
            print(f" {action}")
        return False
    # An empty edit script means nothing has to change to turn file1 into file2.
    print("XML documents are semantically identical")
    return True
# Usage: prints the list of edit actions and returns False when the files differ
compare_xml_semantic('config1.xml', 'config2.xml')
Example Output:
Found 7 differences:
UpdateAttrib(node='/config/logging[1]', name='level', value='DEBUG')
InsertNode(target='/config[1]', tag='featureFlags', position=3)
UpdateTextIn(node='/config/database/port[1]', text='5433')
UpdateTextIn(node='/config/server/port[1]', text='9090')
InsertNode(target='/config/featureFlags[1]', tag='feature', position=0)
InsertAttrib(node='/config/featureFlags/feature[1]', name='enabled', value='true')
InsertAttrib(node='/config/featureFlags/feature[1]', name='name', value='betaMode')
Getting Human-Readable Diff Output
from xmldiff import main, formatting
def get_xml_diff_report(file1, file2):
    """Return xmldiff's result for the two files, rendered by ``DiffFormatter``.

    The value returned is whatever ``main.diff_files`` produces when it is
    given a ``formatting.DiffFormatter`` instance as ``formatter``.
    """
    return main.diff_files(
        file1,
        file2,
        formatter=formatting.DiffFormatter(),
    )
# Usage: the function only returns the report, so print it to see the output
print(get_xml_diff_report('config1.xml', 'config2.xml'))
[update-attribute, /config/logging[1], level, "DEBUG"]
[insert, /config[1], featureFlags, 3]
[update-text, /config/database/port[1], "5433"]
[update-text, /config/server/port[1], "9090"]
[insert, /config/featureFlags[1], feature, 0]
[insert-attribute, /config/featureFlags/feature[1], enabled, "true"]
[insert-attribute, /config/featureFlags/feature[1], name, "betaMode"]
# For XML output showing changes
from xmldiff import main, formatting
def get_annotated_diff(file1, file2):
    """Return xmldiff's result for the two files, rendered by ``XMLFormatter``.

    The formatter is created with ``normalize=formatting.WS_BOTH`` and
    passed to ``main.diff_files`` as ``formatter``.
    """
    xml_formatter = formatting.XMLFormatter(normalize=formatting.WS_BOTH)
    return main.diff_files(file1, file2, formatter=xml_formatter)
# Usage: the function only returns the annotated XML, so print it to see the output
print(get_annotated_diff('config1.xml', 'config2.xml'))
<config xmlns:diff="http://namespaces.shoobx.com/diff">
<database>
<host>localhost</host>
<port>543<diff:delete>2</diff:delete><diff:insert>3</diff:insert></port>
<username>admin</username>
</database>
<server>
<host>127.0.0.1</host>
<port><diff:delete>808</diff:delete><diff:insert>909</diff:insert>0</port>
</server>
<logging level="DEBUG" diff:update-attr="level:INFO"/>
<featureFlags diff:insert="">
<feature diff:insert="" enabled="true" diff:add-attr="enabled;name" name="betaMode"/>
</featureFlags>
</config>
Canonicalization (Standard Library)
When external packages aren't available, use XML Canonicalization (C14N) to normalize documents before comparison:
import xml.etree.ElementTree as ET
import io
import os
def canonicalize_xml(filepath, *, strip_text=False):
    """Return the canonical (C14N 2.0) form of an XML file as UTF-8 bytes.

    Comments are dropped (``with_comments=False``).

    Args:
        filepath: Path of the XML file to canonicalize.
        strip_text: Passed through to ``ET.canonicalize``. When true,
            leading/trailing whitespace in text content is stripped, so a
            pretty-printed file and its minified equivalent canonicalize
            to the same bytes. Defaults to False, which keeps whitespace
            exactly as written in the file.

    Returns:
        The canonical serialization, encoded as UTF-8 bytes.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"{filepath} not found")
    output = io.StringIO()
    ET.canonicalize(
        from_file=filepath,
        out=output,
        with_comments=False,
        strip_text=strip_text,
    )
    # Encode once so callers compare plain bytes.
    return output.getvalue().encode("utf-8")
def compare_xml_canonical(file1, file2):
    """Compare two XML files by their canonical form.

    Canonicalizes both files with ``canonicalize_xml``, prints whether the
    results match, and returns that verdict as a bool.
    """
    same = canonicalize_xml(file1) == canonicalize_xml(file2)
    verdict = (
        "Documents are identical (canonical form)."
        if same
        else "Documents differ (canonical form)."
    )
    print(verdict)
    return same
# Usage: prints the verdict and returns True/False
compare_xml_canonical('config1.xml', 'config2.xml')
Output:
Documents differ (canonical form).
C14N standardizes XML by:
- Sorting attributes alphabetically
- Normalizing whitespace in specific ways
- Using consistent quote characters
- Expanding empty elements (<br/> → <br></br>)
Canonicalizing from Strings
import xml.etree.ElementTree as ET
import io
def canonicalize_string(xml_string):
    """Return the canonical (C14N) text form of an XML string."""
    # With no ``out`` target, ET.canonicalize hands back the text itself.
    return ET.canonicalize(xml_data=xml_string)
# Same document written two ways: attribute order and empty-element syntax differ
xml1 = '<root attr2="b" attr1="a"><child/></root>'
xml2 = '<root attr1="a" attr2="b"><child></child></root>'
# Both produce identical canonical output
print(canonicalize_string(xml1) == canonicalize_string(xml2)) # True
Element-by-Element Comparison
For custom comparison logic or detailed reporting:
import xml.etree.ElementTree as ET
def elements_equal(e1, e2):
    """Recursively compare two XML elements.

    Two elements are equal when their tags, attributes (order-independent),
    whitespace-stripped text, and children (same count, same order, each
    pair equal) all match. The whitespace-stripped tail text that follows
    each child is compared too, so differences in mixed content such as
    ``<p>a<b/>c</p>`` vs ``<p>a<b/>d</p>`` are detected.

    Args:
        e1: First element.
        e2: Second element.

    Returns:
        A ``(equal, message)`` tuple. ``message`` describes the first
        mismatch found, or is ``"Elements are equal"``.
    """
    # Compare tags
    if e1.tag != e2.tag:
        return False, f"Tag mismatch: {e1.tag} vs {e2.tag}"

    # Compare attributes (dict comparison ignores attribute order)
    if dict(e1.attrib) != dict(e2.attrib):
        return False, f"Attribute mismatch in <{e1.tag}>"

    # Compare text content; None and whitespace-only both normalize to ''
    text1 = (e1.text or '').strip()
    text2 = (e2.text or '').strip()
    if text1 != text2:
        return False, f"Text mismatch in <{e1.tag}>: '{text1}' vs '{text2}'"

    # Compare children pairwise, in document order
    children1 = list(e1)
    children2 = list(e2)
    if len(children1) != len(children2):
        return False, f"Child count mismatch in <{e1.tag}>"

    for c1, c2 in zip(children1, children2):
        equal, msg = elements_equal(c1, c2)
        if not equal:
            return False, msg
        # Tail text belongs to the parent's content, so it is checked here
        # rather than in the recursive call (which would also inspect the
        # tail of the top-level elements passed in by the caller).
        tail1 = (c1.tail or '').strip()
        tail2 = (c2.tail or '').strip()
        if tail1 != tail2:
            return False, f"Tail text mismatch after <{c1.tag}>: '{tail1}' vs '{tail2}'"

    return True, "Elements are equal"
def compare_xml_elements(file1, file2):
    """Parse both XML files and compare their root elements recursively.

    Returns whatever ``elements_equal`` returns for the two roots.
    """
    first_root = ET.parse(file1).getroot()
    second_root = ET.parse(file2).getroot()
    return elements_equal(first_root, second_root)
# Usage: the result is an (is_equal, message) pair; message names the first mismatch
is_equal, message = compare_xml_elements('config1.xml', 'config2.xml')
print(message)
Ignoring Specific Elements or Attributes
Sometimes certain fields (like timestamps) should be excluded:
import xml.etree.ElementTree as ET
import copy
def remove_elements(root, xpath_list):
    """Return a deep copy of *root* with the matched elements removed.

    The input tree is never modified.

    Args:
        root: Root ``Element`` of the tree to filter.
        xpath_list: Iterable of ElementTree XPath expressions, evaluated
            relative to the copied root. Every element they match is
            detached from its parent. The root itself has no parent and is
            therefore never removed.

    Returns:
        The filtered copy of *root*.
    """
    pruned = copy.deepcopy(root)
    for xpath in xpath_list:
        # ElementTree elements carry no parent pointer, so build a
        # child -> parent map. Looking the parent up by tag name instead
        # would return the parent of the *first* element with that tag,
        # which is wrong whenever the XPath selects only some of them.
        parent_of = {child: parent for parent in pruned.iter() for child in parent}
        for elem in pruned.findall(xpath):
            parent = parent_of.get(elem)
            if parent is not None:
                parent.remove(elem)
    return pruned
def _drop_tagged_descendants(root, tags):
    """Remove, in place, every descendant of *root* whose tag is in *tags*."""
    # xml.etree elements have no getparent(); map each child to its parent.
    parent_of = {child: parent for parent in root.iter() for child in parent}
    for tag in tags:
        for elem in root.findall(f".//{tag}"):
            parent_of[elem].remove(elem)


def compare_xml_filtered(file1, file2, ignore_tags=None):
    """Compare two XML files, ignoring the specified elements.

    Args:
        file1: Path (or file object) of the first XML document.
        file2: Path (or file object) of the second XML document.
        ignore_tags: Optional list of tag names. Every descendant element
            with one of these tags is removed (together with its subtree)
            from both documents before they are compared.

    Returns:
        True when the canonical (C14N) forms of the filtered documents are
        identical, False otherwise.
    """
    ignore_tags = ignore_tags or []

    canonical_forms = []
    for source in (file1, file2):
        root = ET.parse(source).getroot()
        _drop_tagged_descendants(root, ignore_tags)
        # ET.canonicalize emits text; without an ``out`` target it returns
        # the canonical form directly as a str.
        canonical_forms.append(ET.canonicalize(xml_data=ET.tostring(root)))

    return canonical_forms[0] == canonical_forms[1]
Comparing with lxml (Advanced Features)
For complex XML with namespaces or schemas:
pip install lxml
from lxml import etree
def compare_with_lxml(file1, file2):
    """Return True when both files have the same lxml C14N serialization.

    Both files are parsed with a parser created with
    ``remove_blank_text=True`` and serialized with ``method='c14n'``.
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
    canonical = [
        etree.tostring(etree.parse(source, blank_stripping_parser), method='c14n')
        for source in (file1, file2)
    ]
    return canonical[0] == canonical[1]
def get_xpath_differences(file1, file2):
    """Find differences between two XML files and report their locations.

    The two trees are walked in parallel; children are paired by position.
    Text is compared exactly as parsed (no whitespace normalization), and
    attributes are not compared.

    Args:
        file1: Path of the original document.
        file2: Path of the changed document.

    Returns:
        A list of dicts with the keys ``'xpath'``, ``'type'``, ``'old'``
        and ``'new'``. ``'type'`` is ``'text'`` for a text change (old/new
        hold the two text values), ``'removed'`` for a child present only
        in *file1* and ``'added'`` for a child present only in *file2*
        (old/new hold the tag, or None). Paths are built from tag names
        only, without positional indexes, and use the tag from *file1*
        for paired nodes.
    """
    tree1 = etree.parse(file1)
    tree2 = etree.parse(file2)
    differences = []

    def element_children(node):
        # Keep only nodes whose tag is a string; other node kinds are not
        # given a tag-name path step.
        return [child for child in node if isinstance(child.tag, str)]

    def compare_nodes(node1, node2, path=""):
        current_path = f"{path}/{node1.tag}"
        if node1.text != node2.text:
            differences.append({
                'xpath': current_path,
                'type': 'text',
                'old': node1.text,
                'new': node2.text
            })

        children1 = element_children(node1)
        children2 = element_children(node2)

        # Recurse into children that exist on both sides.
        for child1, child2 in zip(children1, children2):
            compare_nodes(child1, child2, current_path)

        # Whatever is left over on one side has no counterpart.
        for extra in children1[len(children2):]:
            differences.append({
                'xpath': f"{current_path}/{extra.tag}",
                'type': 'removed',
                'old': extra.tag,
                'new': None
            })
        for extra in children2[len(children1):]:
            differences.append({
                'xpath': f"{current_path}/{extra.tag}",
                'type': 'added',
                'old': None,
                'new': extra.tag
            })

    compare_nodes(tree1.getroot(), tree2.getroot())
    return differences
Summary
| Method | Package | Ignores Whitespace | Ignores Attr Order | Best For |
|---|---|---|---|---|
| xmldiff | Third-party | Yes | Yes | Detailed change reports |
| Canonicalization | Standard lib | Only with strip_text=True | Yes | Simple equality checks |
| Element recursion | Standard lib | Configurable | Yes | Custom comparison logic |
| lxml | Third-party | Yes | Yes | Complex XML, namespaces |
For strict data validation, use Canonicalization (ET.canonicalize) from the standard library since it's fast, correct, and requires no dependencies. For generating human-readable diff reports or tracking specific changes, use xmldiff which provides detailed edit operations.