How to Group List to Dictionary by Prefix in Python
When data arrives as a flat stream where headers are followed by values (common in log files, legacy exports, or configuration formats), you need stateful iteration to group items correctly.
This guide covers robust patterns for transforming flat lists into structured dictionaries.
State Loop Method (Recommended)
Track the current group key as you iterate:
# Flat sample stream: every "GROUP-*" string is a header, and the numbers
# after it belong to that header until the next header appears.
raw_data = [
    "GROUP-A", 10, 20,
    "GROUP-B", 5,
    "GROUP-C", 99, 100, 101,
]
def group_by_prefix(data, prefix="GROUP-"):
    """Group a flat sequence into ``{header: [values...]}``.

    A header is any string that starts with ``prefix``. Every other item is
    appended to the list of the most recent header. Items seen before the
    first header are skipped. A header that appears again starts a fresh,
    empty list for that key.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: Leading text that marks a string as a header.

    Returns:
        A dict mapping each header to the list of items that followed it.
    """
    grouped = {}
    bucket = None  # list of the header currently being filled, if any
    for entry in data:
        if isinstance(entry, str) and entry.startswith(prefix):
            # New header: open a fresh list and make it the active one.
            bucket = grouped[entry] = []
            continue
        if bucket is not None:
            bucket.append(entry)
    return grouped
# Group the sample stream; each header maps to the values that followed it.
print(group_by_prefix(raw_data))
# Expected output: {'GROUP-A': [10, 20], 'GROUP-B': [5], 'GROUP-C': [99, 100, 101]}
Handling Values Before First Header
def group_by_prefix_safe(data, prefix="GROUP-", orphan_key="_orphaned"):
    """Group items under their preceding header, keeping leading strays.

    Works like a plain header/value grouper, except that items appearing
    before the first header are collected under ``orphan_key`` instead of
    being skipped. The orphan entry is removed when it ends up empty.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: Leading text that marks a string as a header.
        orphan_key: Dict key used for items that precede any header.

    Returns:
        A dict mapping each header (and possibly ``orphan_key``) to a list.
    """
    grouped = {orphan_key: []}
    active = orphan_key  # key that currently receives values
    for entry in data:
        is_header = isinstance(entry, str) and entry.startswith(prefix)
        if not is_header:
            grouped[active].append(entry)
            continue
        active = entry
        grouped[active] = []

    # Drop the placeholder when nothing was orphaned.
    if len(grouped[orphan_key]) == 0:
        grouped.pop(orphan_key)
    return grouped
# Data with orphaned values at start: 1 and 2 appear before any "GROUP-"
# header, so they are collected under the default orphan key.
data = [1, 2, "GROUP-A", 10, 20, "GROUP-B", 30]
print(group_by_prefix_safe(data))
# Expected output: {'_orphaned': [1, 2], 'GROUP-A': [10, 20], 'GROUP-B': [30]}
Using a Callable Header Detector
Make the grouping logic more flexible:
def group_by_condition(data, is_header):
    """Group items under the most recent item accepted by ``is_header``.

    Args:
        data: Iterable of headers and values in stream order.
        is_header: Callable taking one item and returning a truthy value
            when that item starts a new group. Header items become dict
            keys, so they must be hashable.

    Returns:
        A dict mapping each header item to the list of items that followed
        it. Items seen before the first header are skipped.
    """
    grouped = {}
    active = None  # header currently receiving values
    for entry in data:
        if is_header(entry):
            active = entry
            grouped[active] = []
            continue
        if active is None:
            # No header yet: nothing to attach this value to.
            continue
        grouped[active].append(entry)
    return grouped
# Example: headers are uppercase strings. The isinstance check keeps the
# predicate from calling a str method on the integer values.
data = ["SECTION_A", 1, 2, 3, "SECTION_B", 4, 5]
result = group_by_condition(data, lambda x: isinstance(x, str) and x.isupper())
print(result)
# Expected output: {'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}
# Example: headers start with '#'. This predicate calls x.startswith on every
# item, so it is only suitable when all items in the list are strings.
data = ["# Config", "key=value", "# Database", "host=localhost"]
result = group_by_condition(data, lambda x: x.startswith("#"))
print(result)
# Expected output: {'# Config': ['key=value'], '# Database': ['host=localhost']}
Output:
{'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}
{'# Config': ['key=value'], '# Database': ['host=localhost']}
Using itertools.groupby
For data with clear header/value markers:
from itertools import groupby
# Each record is a (marker, payload) pair: "H" marks a header whose payload
# is the group name, "V" marks a value belonging to the preceding header.
data = [
    ("H", "Users"),
    ("V", "Alice"),
    ("V", "Bob"),
    ("H", "Admins"),
    ("V", "Charlie"),
]
def group_with_markers(data):
    """Group ``(marker, payload)`` records into ``{header: [values...]}``.

    Records whose marker is ``"H"`` are headers and their payload becomes a
    dict key. All other records are values whose payload is appended to the
    most recent header.

    ``itertools.groupby`` yields *runs* of consecutive records, so a run of
    back-to-back headers arrives as a single group. Every header in such a
    run is registered (with an empty list) and the last one becomes the
    active group. Values that appear before any header have nowhere to go
    and are skipped rather than raising ``KeyError``.

    Args:
        data: Iterable of indexable records; index 0 is the marker and
            index 1 is the payload.

    Returns:
        A dict mapping each header payload to the list of value payloads
        that followed it.
    """
    result = {}
    bucket = None  # list of the active header; None until a header is seen
    for is_header, run in groupby(data, key=lambda x: x[0] == "H"):
        if is_header:
            # Register every header in the run, not just the first one.
            for record in run:
                bucket = result[record[1]] = []
        elif bucket is not None:
            bucket.extend(record[1] for record in run)
    return result
# Group the marker-tagged records by their header payload.
print(group_with_markers(data))
# Expected output: {'Users': ['Alice', 'Bob'], 'Admins': ['Charlie']}
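itertools.groupby only groups consecutive items that share the same key; it never reorders the input. This pattern relies on that, but it also means a run of back-to-back headers arrives as a single group, and values that appear before the first header have no header to attach to, so both cases need explicit handling. The state loop method is safer for unpredictable data.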
Practical Examples
Log File Parsing
# Sample log: a line starting with "[" opens a new entry, and the lines
# that follow it are that entry's detail lines.
log_lines = [
    "[ERROR] Connection failed",
    " Timeout after 30s",
    " Retry count: 3",
    "[WARNING] Disk space low",
    " Available: 500MB",
    "[ERROR] Authentication failed",
    " User: admin",
    " IP: 192.168.1.100",
]
def parse_log_entries(lines):
    """Parse ``[LEVEL] message`` lines and their detail lines into entries.

    A line of the form ``[LEVEL] message`` starts a new entry. Any other
    line is stripped and stored as a detail of the most recent entry.
    Lines that appear before the first entry are ignored.

    A line that starts with ``[`` but has no closing ``]`` is not a valid
    header; it is treated as a detail line instead of raising
    ``ValueError``. The message is whatever follows ``]`` with surrounding
    whitespace removed, so ``[ERROR]msg`` and lines with a trailing newline
    are handled as well.

    Args:
        lines: Iterable of log line strings.

    Returns:
        A list of dicts with keys ``"level"``, ``"message"`` and
        ``"details"`` (a list of stripped detail strings), in input order.
    """
    entries = []
    for line in lines:
        if line.startswith("["):
            level, closing, rest = line[1:].partition("]")
            if closing:
                entries.append(
                    {"level": level, "message": rest.strip(), "details": []}
                )
                continue
        # Not a well-formed header: attach to the latest entry, if any.
        if entries:
            entries[-1]["details"].append(line.strip())
    return entries
# Parse the sample log and print one entry dict per line.
parsed = parse_log_entries(log_lines)
for entry in parsed:
    print(entry)
Output:
{'level': 'ERROR', 'message': 'Connection failed', 'details': ['Timeout after 30s', 'Retry count: 3']}
{'level': 'WARNING', 'message': 'Disk space low', 'details': ['Available: 500MB']}
{'level': 'ERROR', 'message': 'Authentication failed', 'details': ['User: admin', 'IP: 192.168.1.100']}
Configuration Section Grouping
# Sample INI-style lines: "[name]" opens a section, "key=value" lines
# belong to the section opened most recently.
config_data = [
    "[database]",
    "host=localhost",
    "port=5432",
    "[cache]",
    "enabled=true",
    "ttl=3600",
    "[logging]",
    "level=debug",
]
def parse_ini_style(lines):
    """Parse INI-style lines into ``{section: {key: value}}``.

    A line of the form ``[name]`` opens a section. Each following line that
    contains ``=`` is split on the first ``=`` into a key and a value and
    stored in the open section. Lines without ``=`` and lines that appear
    before any section are ignored. A section header that appears again
    starts over with an empty dict.

    Surrounding whitespace is stripped from each line and from keys and
    values, so lines read straight from a file (with trailing newlines) and
    ``key = value`` spacing are handled. Values are kept as strings.

    Args:
        lines: Iterable of line strings.

    Returns:
        A dict mapping section names to dicts of key/value strings.
    """
    sections = {}
    current_section = None
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("[") and line.endswith("]"):
            current_section = line[1:-1]
            sections[current_section] = {}
        # Compare against None so an empty section name ("[]") still works.
        elif current_section is not None and "=" in line:
            key, value = line.split("=", 1)
            sections[current_section][key.strip()] = value.strip()
    return sections
# Parse the sample lines into a dict of sections and print it.
print(parse_ini_style(config_data))
Output:
{'database': {'host': 'localhost', 'port': '5432'}, 'cache': {'enabled': 'true', 'ttl': '3600'}, 'logging': {'level': 'debug'}}
Numbered Section Grouping
# Sample stream where a bare "<number>." string opens a section and the
# strings after it are that section's items.
data = [
    "1.", "First item", "First detail",
    "2.", "Second item",
    "3.", "Third item", "Detail A", "Detail B",
]
import re
def group_numbered_sections(items):
    """Group strings under the preceding ``"<digits>."`` marker.

    An item made of one or more digits followed by a dot (for example
    ``"1."``) starts a new section. Every other item is appended to the
    current section. Items seen before the first marker are skipped.

    Args:
        items: Iterable of strings in stream order.

    Returns:
        A dict mapping each marker string to the list of items after it.
    """
    is_marker = re.compile(r"^\d+\.$").match
    sections = {}
    bucket = None  # list of the section currently being filled, if any
    for text in items:
        if is_marker(text):
            bucket = sections[text] = []
            continue
        if bucket is not None:
            bucket.append(text)
    return sections
# Group the sample stream by its numbered markers and print the result.
print(group_numbered_sections(data))
Output:
{'1.': ['First item', 'First detail'], '2.': ['Second item'], '3.': ['Third item', 'Detail A', 'Detail B']}
Using defaultdict for Cleaner Code
from collections import defaultdict
def group_by_prefix_defaultdict(data, prefix):
    """Group a flat sequence into ``{header: [values...]}`` via defaultdict.

    A header is any string that starts with ``prefix``. Every other item is
    appended to the list of the most recent header. Items seen before the
    first header are skipped. When a header appears more than once, its
    values accumulate in the same list.

    A header is registered as soon as it is seen, so a header with no
    following values still appears in the result with an empty list.

    Args:
        data: Iterable of headers and values in stream order.
        prefix: Leading text that marks a string as a header.

    Returns:
        A plain dict mapping each header to the list of items after it.
    """
    result = defaultdict(list)
    bucket = None  # list of the active header; None until a header is seen
    for item in data:
        if isinstance(item, str) and item.startswith(prefix):
            # The lookup itself creates the list, so empty groups are kept.
            bucket = result[item]
        elif bucket is not None:
            bucket.append(item)
    return dict(result)
# Sample stream using "CAT-" headers instead of "GROUP-".
data = ["CAT-A", 1, 2, "CAT-B", 3, 4, 5]
print(group_by_prefix_defaultdict(data, "CAT-"))
Output:
{'CAT-A': [1, 2], 'CAT-B': [3, 4, 5]}
Transforming Values During Grouping
def group_and_transform(data, is_header, transform=None):
    """Group items under their preceding header, converting each value.

    Args:
        data: Iterable of headers and values in stream order.
        is_header: Callable taking one item and returning a truthy value
            when that item starts a new group. Header items become dict
            keys, so they must be hashable.
        transform: Optional callable applied to every value before it is
            stored. Headers are never transformed. When omitted, values
            are stored unchanged.

    Returns:
        A dict mapping each header item to the list of (transformed) items
        that followed it. Items seen before the first header are skipped.
    """
    convert = transform if transform else (lambda value: value)
    grouped = {}
    active = None  # header currently receiving values
    for entry in data:
        if is_header(entry):
            active = entry
            grouped[active] = []
            continue
        if active is None:
            continue
        grouped[active].append(convert(entry))
    return grouped
# Example: the values arrive as numeric strings; passing transform=int
# stores them as integers while the header strings stay untouched.
data = ["GROUP-A", "10", "20", "GROUP-B", "30"]
result = group_and_transform(
    data,
    is_header=lambda x: x.startswith("GROUP-"),
    transform=int,
)
print(result)
Output:
{'GROUP-A': [10, 20], 'GROUP-B': [30]}
Summary
| Method | Readability | Flexibility | Best For |
|---|---|---|---|
| State loop | High | High | General-purpose, messy data |
| With callable | High | Very high | Configurable header detection |
| groupby | Medium | Medium | Clean, predictable formats |
| defaultdict | High | Medium | Cleaner initialization |
Use the state loop method for most real-world data. It is explicit, easy to debug, and easy to extend for edge cases such as orphaned values (see group_by_prefix_safe). Reserve itertools.groupby for data that already arrives as clean, consecutive header/value runs with clear markers.