Skip to main content

How to Compare Two XML Files in Python

Comparing XML documents requires special handling because structurally identical XML can have different text representations. Whitespace variations, attribute ordering, and formatting differences are all semantically meaningless but break simple text comparison.

This guide covers proper XML comparison techniques using both third-party libraries and Python's standard library.

Why Text Comparison Fails for XML

Standard string or line-by-line comparison produces false positives with XML:

# These are semantically IDENTICAL:
xml1 = '<user id="1" name="Alice"/>'
xml2 = '<user name="Alice" id="1"/>'

# But text comparison says they differ
print(xml1 == xml2) # False (WRONG answer)
Common Mistakes

Never use open(f).read() == open(f2).read() or difflib on raw XML files. These approaches fail because:

  • Whitespace: Pretty-printed and minified XML are functionally equivalent
  • Attribute order: x="1" y="2" equals y="2" x="1" in XML semantics
  • Quote styles: Single and double quotes are interchangeable

Config Files for the examples in the following sections

config1.xml
<?xml version="1.0" encoding="UTF-8"?>
<config>
<database>
<host>localhost</host>
<port>5432</port>
<username>admin</username>
</database>

<server>
<host>127.0.0.1</host>
<port>8080</port>
</server>

<logging level="INFO"/>
</config>
config2.xml
<?xml version="1.0" encoding="UTF-8"?>
<config>
<database>
<host>localhost</host>
<port>5433</port>
<username>admin</username>
</database>

<server>
<host>127.0.0.1</host>
<port>9090</port>
</server>

<logging level="DEBUG"/>

<featureFlags>
<feature name="betaMode" enabled="true"/>
</featureFlags>
</config>

Semantic Comparison with xmldiff

The xmldiff library understands XML structure and reports meaningful differences:

pip install xmldiff
from xmldiff import main, formatting

def compare_xml_semantic(file1, file2):
    """Report the structural differences between two XML files.

    Prints either a confirmation that the documents match or one line per
    edit action returned by ``xmldiff``.

    Returns:
        True when xmldiff reports no edit actions, False otherwise.
    """
    edit_script = main.diff_files(file1, file2)
    identical = not edit_script

    if identical:
        print("XML documents are semantically identical")
    else:
        print(f"Found {len(edit_script)} differences:")
        for action in edit_script:
            print(f" {action}")

    return identical

# Usage
compare_xml_semantic('config1.xml', 'config2.xml')

Example Output:

Found 7 differences:
UpdateAttrib(node='/config/logging[1]', name='level', value='DEBUG')
InsertNode(target='/config[1]', tag='featureFlags', position=3)
UpdateTextIn(node='/config/database/port[1]', text='5433')
UpdateTextIn(node='/config/server/port[1]', text='9090')
InsertNode(target='/config/featureFlags[1]', tag='feature', position=0)
InsertAttrib(node='/config/featureFlags/feature[1]', name='enabled', value='true')
InsertAttrib(node='/config/featureFlags/feature[1]', name='name', value='betaMode')

Getting Human-Readable Diff Output

from xmldiff import main, formatting

def get_xml_diff_report(file1, file2):
    """Return xmldiff's textual edit script for the two files.

    The diff is rendered by ``formatting.DiffFormatter``; the result is
    returned to the caller rather than printed.
    """
    return main.diff_files(
        file1,
        file2,
        formatter=formatting.DiffFormatter(),
    )

# Usage
get_xml_diff_report('config1.xml', 'config2.xml')
[update-attribute, /config/logging[1], level, "DEBUG"]
[insert, /config[1], featureFlags, 3]
[update-text, /config/database/port[1], "5433"]
[update-text, /config/server/port[1], "9090"]
[insert, /config/featureFlags[1], feature, 0]
[insert-attribute, /config/featureFlags/feature[1], enabled, "true"]
[insert-attribute, /config/featureFlags/feature[1], name, "betaMode"]
# For XML output showing changes
from xmldiff import main, formatting

def get_annotated_diff(file1, file2):
    """Return the diff of two XML files rendered by xmldiff's XMLFormatter.

    The formatter is created with ``normalize=formatting.WS_BOTH``; the
    result is returned to the caller rather than printed.
    """
    xml_formatter = formatting.XMLFormatter(normalize=formatting.WS_BOTH)
    return main.diff_files(file1, file2, formatter=xml_formatter)

# Usage
get_annotated_diff('config1.xml', 'config2.xml')
<config xmlns:diff="http://namespaces.shoobx.com/diff">
<database>
<host>localhost</host>
<port>543<diff:delete>2</diff:delete><diff:insert>3</diff:insert></port>
<username>admin</username>
</database>
<server>
<host>127.0.0.1</host>
<port><diff:delete>808</diff:delete><diff:insert>909</diff:insert>0</port>
</server>
<logging level="DEBUG" diff:update-attr="level:INFO"/>
<featureFlags diff:insert="">
<feature diff:insert="" enabled="true" diff:add-attr="enabled;name" name="betaMode"/>
</featureFlags>
</config>

Canonicalization (Standard Library)

When external packages aren't available, use XML Canonicalization (C14N) to normalize documents before comparison:

import xml.etree.ElementTree as ET
import io
import os


def canonicalize_xml(filepath, *, strip_text=True):
    """Return the C14N 2.0 canonical form of an XML file as UTF-8 bytes.

    Requires Python 3.8+, where ``ET.canonicalize`` was introduced.

    Args:
        filepath: Path of the XML file to canonicalize.
        strip_text: When True (the default), leading and trailing whitespace
            in text content is stripped, so pretty-printed and minified
            versions of the same document canonicalize identically. Pass
            False when whitespace in text content is significant.

    Returns:
        The canonical serialization, encoded as UTF-8 bytes.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"{filepath} not found")

    # Without an ``out`` argument, ET.canonicalize returns the canonical
    # text directly, so no intermediate StringIO buffer is needed.
    canonical_text = ET.canonicalize(
        from_file=filepath,
        with_comments=False,
        # C14N keeps text whitespace by default; strip it so that
        # indentation differences are not reported as changes.
        strip_text=strip_text,
    )

    return canonical_text.encode("utf-8")  # bytes for a safe comparison


def compare_xml_canonical(file1, file2):
    """Compare two XML files by their canonical forms.

    Prints the verdict and returns True when the canonical forms are
    equal, False otherwise.
    """
    same = canonicalize_xml(file1) == canonicalize_xml(file2)

    verdict = (
        "Documents are identical (canonical form)."
        if same
        else "Documents differ (canonical form)."
    )
    print(verdict)
    return same


compare_xml_canonical('config1.xml', 'config2.xml')

Output:

Documents differ (canonical form).
What Canonicalization Does

C14N standardizes XML by:

  • Sorting attributes alphabetically
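  • Normalizing line endings and the whitespace inside tags (whitespace in text content is preserved unless strip_text=True is passed to ET.canonicalize)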
  • Using consistent quote characters
  • Expanding empty elements (<br/> becomes <br></br>)

Canonicalizing from Strings

import xml.etree.ElementTree as ET
import io

def canonicalize_string(xml_string, *, strip_text=True):
    """Return the C14N 2.0 canonical form of an XML string.

    Args:
        xml_string: The XML document as a string.
        strip_text: When True (the default), leading and trailing whitespace
            in text content is stripped, so indentation and line breaks
            between elements do not affect the result. Pass False when
            whitespace in text content is significant.

    Returns:
        The canonical serialization as a string.
    """
    # Without an ``out`` argument, ET.canonicalize returns the canonical
    # text directly.
    return ET.canonicalize(xml_data=xml_string, strip_text=strip_text)

xml1 = '<root attr2="b" attr1="a"><child/></root>'
xml2 = '<root attr1="a" attr2="b"><child></child></root>'

# Both produce identical canonical output
print(canonicalize_string(xml1) == canonicalize_string(xml2)) # True

Element-by-Element Comparison

For custom comparison logic or detailed reporting:

import xml.etree.ElementTree as ET

def elements_equal(e1, e2):
    """Recursively compare two XML elements.

    Tags, attributes, whitespace-stripped text, the number and order of
    children, and the whitespace-stripped tail text following each child
    are compared. Attribute order is ignored.

    Args:
        e1: First element.
        e2: Second element.

    Returns:
        A ``(equal, message)`` tuple. ``equal`` is True when the elements
        match; otherwise ``message`` describes the first mismatch found.
    """
    # Compare tags
    if e1.tag != e2.tag:
        return False, f"Tag mismatch: {e1.tag} vs {e2.tag}"

    # Compare attributes (dict equality ignores ordering)
    if dict(e1.attrib) != dict(e2.attrib):
        return False, f"Attribute mismatch in <{e1.tag}>"

    # Compare text content (normalized)
    text1 = (e1.text or '').strip()
    text2 = (e2.text or '').strip()
    if text1 != text2:
        return False, f"Text mismatch in <{e1.tag}>: '{text1}' vs '{text2}'"

    # Compare children
    children1 = list(e1)
    children2 = list(e2)

    if len(children1) != len(children2):
        return False, f"Child count mismatch in <{e1.tag}>"

    for c1, c2 in zip(children1, children2):
        equal, msg = elements_equal(c1, c2)
        if not equal:
            return False, msg

        # Text that follows a child (mixed content) is stored on the
        # child's ``tail``, not on the parent's ``text``; compare it here
        # so a change like <r><c/>one</r> vs <r><c/>two</r> is detected.
        tail1 = (c1.tail or '').strip()
        tail2 = (c2.tail or '').strip()
        if tail1 != tail2:
            return False, (
                f"Tail text mismatch after <{c1.tag}>: '{tail1}' vs '{tail2}'"
            )

    return True, "Elements are equal"

def compare_xml_elements(file1, file2):
    """Parse both files and compare their root elements recursively.

    Returns the ``(equal, message)`` tuple produced by ``elements_equal``.
    """
    first_root, second_root = (
        ET.parse(source).getroot() for source in (file1, file2)
    )
    return elements_equal(first_root, second_root)

# Usage
is_equal, message = compare_xml_elements('config1.xml', 'config2.xml')
print(message)

Ignoring Specific Elements or Attributes

Sometimes certain fields (like timestamps) should be excluded:

import xml.etree.ElementTree as ET
import copy

def remove_elements(root, xpath_list):
    """Return a copy of ``root`` with the elements matching any XPath removed.

    The input tree is not modified. A removed element is dropped together
    with its subtree and its tail text.

    Args:
        root: Root element of the tree to filter.
        xpath_list: ElementTree XPath expressions, evaluated relative to
            the root, selecting the elements to remove.

    Returns:
        A deep copy of ``root`` without the matched elements.
    """
    pruned = copy.deepcopy(root)

    # ElementTree elements do not know their parent, so build the mapping
    # once. Looking a parent up by tag name would return the parent of the
    # first element with that tag, which may not be the parent of the
    # element being removed.
    parent_of = {child: parent for parent in pruned.iter() for child in parent}

    for xpath in xpath_list:
        for elem in pruned.findall(xpath):
            parent = parent_of.get(elem)
            if parent is not None:  # the root itself has no parent
                parent.remove(elem)

    return pruned

def compare_xml_filtered(file1, file2, ignore_tags=None):
    """Compare two XML files, ignoring every element with a listed tag.

    Each file is parsed, all descendants whose tag is in ``ignore_tags``
    are removed together with their subtrees, and the remaining documents
    are compared in canonical (C14N 2.0) form with surrounding text
    whitespace stripped.

    Args:
        file1: Path of the first XML file.
        file2: Path of the second XML file.
        ignore_tags: Tag names to exclude from the comparison. ``None``
            (the default) ignores nothing.

    Returns:
        True when the filtered documents are canonically equal.
    """
    ignore_tags = ignore_tags or []

    canonical_forms = []
    for source in (file1, file2):
        root = ET.parse(source).getroot()

        # Standard-library elements have no getparent(); build the
        # child -> parent mapping up front instead.
        parent_of = {
            child: parent for parent in root.iter() for child in parent
        }

        # Remove ignored elements
        for tag in ignore_tags:
            for elem in root.findall(f".//{tag}"):
                parent_of[elem].remove(elem)

        # ET.canonicalize produces text, so take its return value rather
        # than writing into a bytes buffer. strip_text=True keeps the
        # whitespace left behind by removed elements from counting as a
        # difference.
        canonical_forms.append(
            ET.canonicalize(
                xml_data=ET.tostring(root, encoding="unicode"),
                strip_text=True,
            )
        )

    return canonical_forms[0] == canonical_forms[1]

Comparing with lxml (Advanced Features)

For complex XML with namespaces or schemas:

pip install lxml
from lxml import etree

def compare_with_lxml(file1, file2):
    """Return True when both files have the same lxml C14N serialization.

    Both files are parsed with a parser created with
    ``remove_blank_text=True`` and then serialized with ``method='c14n'``
    for comparison.
    """
    blank_stripping_parser = etree.XMLParser(remove_blank_text=True)

    first_canonical, second_canonical = (
        etree.tostring(
            etree.parse(source, blank_stripping_parser), method='c14n'
        )
        for source in (file1, file2)
    )

    return first_canonical == second_canonical

def get_xpath_differences(file1, file2):
    """Find differences between two XML files and report their locations.

    The two trees are walked in parallel, pairing child elements by
    position. Paths are built from tag names only (no positional index),
    so repeated sibling tags share the same path.

    Args:
        file1: Path of the original XML file.
        file2: Path of the modified XML file.

    Returns:
        A list of dicts with the keys ``xpath``, ``type``, ``old`` and
        ``new``. ``type`` is one of:

        * ``'tag'`` - paired elements have different tags (their subtrees
          are not compared further);
        * ``'text'`` - the element text differs;
        * ``'attribute'`` - the attribute dictionaries differ;
        * ``'children'`` - the number of child elements differs
          (``old``/``new`` hold the counts).
    """
    tree1 = etree.parse(file1)
    tree2 = etree.parse(file2)

    differences = []

    def element_children(node):
        # Comments and processing instructions have a non-string tag;
        # only real elements take part in the comparison.
        return [child for child in node if isinstance(child.tag, str)]

    def compare_nodes(node1, node2, path=""):
        current_path = f"{path}/{node1.tag}"

        if node1.tag != node2.tag:
            differences.append({
                'xpath': current_path,
                'type': 'tag',
                'old': node1.tag,
                'new': node2.tag
            })
            # Different elements: comparing their contents would only
            # produce misleading entries.
            return

        if node1.text != node2.text:
            differences.append({
                'xpath': current_path,
                'type': 'text',
                'old': node1.text,
                'new': node2.text
            })

        attrs1 = dict(node1.attrib)
        attrs2 = dict(node2.attrib)
        if attrs1 != attrs2:
            differences.append({
                'xpath': current_path,
                'type': 'attribute',
                'old': attrs1,
                'new': attrs2
            })

        children1 = element_children(node1)
        children2 = element_children(node2)
        if len(children1) != len(children2):
            differences.append({
                'xpath': current_path,
                'type': 'children',
                'old': len(children1),
                'new': len(children2)
            })

        # zip stops at the shorter list; surplus children are covered by
        # the 'children' entry above.
        for child1, child2 in zip(children1, children2):
            compare_nodes(child1, child2, current_path)

    compare_nodes(tree1.getroot(), tree2.getroot())
    return differences

Summary

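| Method | Package | Ignores Whitespace | Ignores Attr Order | Best For |
| --- | --- | --- | --- | --- |
| xmldiff | Third-party | Yes | Yes | Detailed change reports |
| Canonicalization | Standard lib | Yes (with strip_text=True) | Yes | Simple equality checks |
| Element recursion | Standard lib | Configurable | Yes | Custom comparison logic |
| lxml | Third-party | Yes (with remove_blank_text=True) | Yes | Complex XML, namespaces |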
Best Practice

For strict data validation, use Canonicalization (ET.canonicalize) from the standard library since it's fast, correct, and requires no dependencies. For generating human-readable diff reports or tracking specific changes, use xmldiff which provides detailed edit operations.