Skip to main content

How to Group List to Dictionary by Prefix in Python

When data arrives as a flat stream where headers are followed by values (common in log files, legacy exports, or configuration formats), you need stateful iteration to group items correctly.

This guide covers robust patterns for transforming flat lists into structured dictionaries.

Track the current group key as you iterate:

raw_data = [
"GROUP-A", 10, 20,
"GROUP-B", 5,
"GROUP-C", 99, 100, 101
]

def group_by_prefix(data, prefix="GROUP-"):
    """Group a flat stream into a dict keyed by its header items.

    An item is a header when it is a string that starts with *prefix*.
    Every non-header item is appended to the most recent header's list.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: String prefix that marks an item as a header.

    Returns:
        Dict mapping each header to the list of values that followed it.
        Values that appear before the first header are dropped.
    """
    result = {}
    current_key = None

    for item in data:
        if isinstance(item, str) and item.startswith(prefix):
            current_key = item
            # setdefault keeps values already collected when a header
            # repeats, instead of resetting the group to an empty list.
            result.setdefault(current_key, [])
        elif current_key is not None:
            result[current_key].append(item)

    return result

print(group_by_prefix(raw_data))
# {'GROUP-A': [10, 20], 'GROUP-B': [5], 'GROUP-C': [99, 100, 101]}

Handling Values Before First Header

def group_by_prefix_safe(data, prefix="GROUP-", orphan_key="_orphaned"):
    """Group a flat stream by header, keeping items that precede any header.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: String prefix that marks an item as a header.
        orphan_key: Key under which values seen before the first header
            are collected.

    Returns:
        Dict mapping each header to its values. The *orphan_key* entry
        comes first and is present only when it collected something.
    """
    result = {orphan_key: []}
    current_key = orphan_key

    for item in data:
        if isinstance(item, str) and item.startswith(prefix):
            current_key = item
            # Merge into an existing group when a header repeats rather
            # than discarding what it already collected.
            result.setdefault(current_key, [])
        else:
            result[current_key].append(item)

    # Drop the orphan bucket when nothing landed in it.
    if not result[orphan_key]:
        del result[orphan_key]

    return result

# Data with orphaned values at start
data = [1, 2, "GROUP-A", 10, 20, "GROUP-B", 30]
print(group_by_prefix_safe(data))
# {'_orphaned': [1, 2], 'GROUP-A': [10, 20], 'GROUP-B': [30]}

Using a Callable Header Detector

Make the grouping logic more flexible:

def group_by_condition(data, is_header):
    """Group items under headers identified by a caller-supplied predicate.

    Args:
        data: Iterable of headers and values in stream order.
        is_header: Callable invoked on every item; a truthy result marks
            the item as a header. Headers become dict keys, so they must
            be hashable.

    Returns:
        Dict mapping each header to the list of items that followed it.
        Items that appear before the first header are dropped.
    """
    result = {}
    current_key = None

    for item in data:
        if is_header(item):
            current_key = item
            # A repeated header continues its earlier group instead of
            # wiping the values collected so far.
            result.setdefault(current_key, [])
        elif current_key is not None:
            result[current_key].append(item)

    return result

# Example: Headers are uppercase strings
data = ["SECTION_A", 1, 2, 3, "SECTION_B", 4, 5]
result = group_by_condition(data, lambda x: isinstance(x, str) and x.isupper())
print(result)
# {'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}

# Example: Headers start with '#'
data = ["# Config", "key=value", "# Database", "host=localhost"]
result = group_by_condition(data, lambda x: x.startswith("#"))
print(result)
# {'# Config': ['key=value'], '# Database': ['host=localhost']}

Output:

{'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}
{'# Config': ['key=value'], '# Database': ['host=localhost']}

Using itertools.groupby

For data with clear header/value markers:

from itertools import groupby

data = [
("H", "Users"),
("V", "Alice"),
("V", "Bob"),
("H", "Admins"),
("V", "Charlie")
]

def group_with_markers(data):
    """Group ("H", name) / ("V", value) records into a dict of lists.

    Args:
        data: Iterable of indexable records whose first element is the
            marker ("H" for a header, anything else for a value) and whose
            second element is the payload.

    Returns:
        Dict mapping each header payload to the value payloads that
        followed it. Values that appear before the first header are
        dropped.
    """
    result = {}
    current_key = None

    for is_header, group in groupby(data, key=lambda record: record[0] == "H"):
        if is_header:
            # groupby merges back-to-back headers into a single run, so
            # register every header in it; the last one owns the values
            # that follow.
            for record in group:
                current_key = record[1]
                result.setdefault(current_key, [])
        elif current_key is not None:
            # Guarded so a leading run of values cannot hit result[None].
            result[current_key].extend(record[1] for record in group)

    return result

print(group_with_markers(data))
# {'Users': ['Alice', 'Bob'], 'Admins': ['Charlie']}
groupby Limitation

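itertools.groupby only groups consecutive items that share the same key, and here the key is "is this item a header?". Do not sort the data first: the original order is what ties each value to its header. Watch for irregular input: back-to-back headers arrive as a single run, and values that precede the first header have no group to join. The state loop method is safer for unpredictable data.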

Practical Examples

Log File Parsing

log_lines = [
"[ERROR] Connection failed",
" Timeout after 30s",
" Retry count: 3",
"[WARNING] Disk space low",
" Available: 500MB",
"[ERROR] Authentication failed",
" User: admin",
" IP: 192.168.1.100"
]

def parse_log_entries(lines):
    """Parse "[LEVEL] message" lines and their continuation lines.

    Args:
        lines: Iterable of log line strings.

    Returns:
        List of dicts with keys "level", "message" and "details", one per
        header line, in input order. "details" holds the stripped
        continuation lines that followed the header. Lines that appear
        before the first header are dropped.
    """
    entries = []
    current_entry = None

    for line in lines:
        # A header needs both brackets. A line that opens "[" but never
        # closes it is treated as a continuation line instead of raising
        # ValueError.
        if line.startswith("[") and "]" in line:
            level, _, rest = line[1:].partition("]")
            # strip() copes with zero, one or many spaces after "]".
            current_entry = {"level": level, "message": rest.strip(), "details": []}
            entries.append(current_entry)
        elif current_entry is not None:
            current_entry["details"].append(line.strip())

    return entries

parsed = parse_log_entries(log_lines)
for entry in parsed:
print(entry)

Output:

{'level': 'ERROR', 'message': 'Connection failed', 'details': ['Timeout after 30s', 'Retry count: 3']}
{'level': 'WARNING', 'message': 'Disk space low', 'details': ['Available: 500MB']}
{'level': 'ERROR', 'message': 'Authentication failed', 'details': ['User: admin', 'IP: 192.168.1.100']}

Configuration Section Grouping

config_data = [
"[database]",
"host=localhost",
"port=5432",
"[cache]",
"enabled=true",
"ttl=3600",
"[logging]",
"level=debug"
]

def parse_ini_style(lines):
    """Parse "[section]" / "key=value" lines into nested dicts.

    Args:
        lines: Iterable of line strings.

    Returns:
        Dict mapping each section name to a dict of its key/value strings.
        Surrounding whitespace is stripped from lines, keys and values.
        Lines without "=" and lines before the first section are ignored.
    """
    sections = {}
    current_section = None

    for raw_line in lines:
        # Strip first so trailing newlines or indentation do not hide a
        # section header or leak into keys and values.
        line = raw_line.strip()
        if line.startswith("[") and line.endswith("]"):
            current_section = line[1:-1]
            # A section that appears twice keeps its earlier keys.
            sections.setdefault(current_section, {})
        # "is not None" so that an empty section name ("[]") still
        # receives its keys.
        elif current_section is not None and "=" in line:
            # Split on the first "=" only; the value may contain "=".
            key, _, value = line.partition("=")
            sections[current_section][key.strip()] = value.strip()

    return sections

print(parse_ini_style(config_data))

Output:

{'database': {'host': 'localhost', 'port': '5432'}, 'cache': {'enabled': 'true', 'ttl': '3600'}, 'logging': {'level': 'debug'}}

Numbered Section Grouping

data = [
"1.", "First item", "First detail",
"2.", "Second item",
"3.", "Third item", "Detail A", "Detail B"
]

import re

def group_numbered_sections(items):
    """Group items under numbered headers such as "1." or "12.".

    Args:
        items: Iterable of headers and values in stream order.

    Returns:
        Dict mapping each numbered header to the list of items that
        followed it. Items that appear before the first header are
        dropped.
    """
    result = {}
    current = None

    for item in items:
        # fullmatch rejects "1.\n", which "^...$" would accept because "$"
        # also matches just before a trailing newline. The isinstance
        # check lets non-string values through without a TypeError.
        if isinstance(item, str) and re.fullmatch(r"\d+\.", item):
            current = item
            # A repeated number continues its earlier group.
            result.setdefault(current, [])
        elif current is not None:
            result[current].append(item)

    return result

print(group_numbered_sections(data))

Output:

{'1.': ['First item', 'First detail'], '2.': ['Second item'], '3.': ['Third item', 'Detail A', 'Detail B']}

Using defaultdict for Cleaner Code

from collections import defaultdict

def group_by_prefix_defaultdict(data, prefix):
    """Group a flat stream by header using a defaultdict of lists.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: String prefix that marks an item as a header.

    Returns:
        A plain dict mapping each header to the values that followed it.
        A repeated header keeps adding to its existing list. Values that
        appear before the first header are dropped.
    """
    result = defaultdict(list)
    bucket = None  # list of the current header; None until one is seen

    for item in data:
        if isinstance(item, str) and item.startswith(prefix):
            # Indexing creates the list on first sight, so a header with
            # no values still appears in the result as an empty group.
            bucket = result[item]
        elif bucket is not None:
            bucket.append(item)

    return dict(result)

data = ["CAT-A", 1, 2, "CAT-B", 3, 4, 5]
print(group_by_prefix_defaultdict(data, "CAT-"))

Output:

{'CAT-A': [1, 2], 'CAT-B': [3, 4, 5]}

Transforming Values During Grouping

def group_and_transform(data, is_header, transform=None):
    """Group items under headers, optionally converting each value.

    Args:
        data: Iterable of headers and values in stream order.
        is_header: Callable invoked on every item; a truthy result marks
            the item as a header. Headers become dict keys, so they must
            be hashable.
        transform: Optional callable applied to each value before it is
            stored. Headers are never transformed. None stores values
            unchanged.

    Returns:
        Dict mapping each header to its (transformed) values. Values that
        appear before the first header are dropped without being
        transformed.
    """
    result = {}
    current_key = None

    for item in data:
        if is_header(item):
            current_key = item
            # A repeated header continues its earlier group.
            result.setdefault(current_key, [])
        elif current_key is not None:
            # Compare against None explicitly: a callable object may
            # legitimately be falsy.
            value = item if transform is None else transform(item)
            result[current_key].append(value)

    return result

# Example: Parse string values to integers
data = ["GROUP-A", "10", "20", "GROUP-B", "30"]
result = group_and_transform(
data,
is_header=lambda x: x.startswith("GROUP-"),
transform=int
)
print(result)

Output:

{'GROUP-A': [10, 20], 'GROUP-B': [30]}

Summary

| Method | Readability | Flexibility | Best For |
| --- | --- | --- | --- |
| State loop | High | High | General-purpose, messy data |
| With callable | High | Very high | Configurable header detection |
| groupby | Medium | Medium | Clean, predictable formats |
| defaultdict | High | Medium | Cleaner initialization |

Best Practice

Use the state loop method for most real-world data. It is explicit, easy to debug, and makes edge cases such as orphaned values easy to handle. Reserve itertools.groupby for well-formed data in which every item carries an explicit header/value marker and the original order is intact.