How to Group a List into a Dictionary by Prefix in Python

When data arrives as a flat stream where headers are followed by values (common in log files, legacy exports, or configuration formats), you need stateful iteration to group items correctly.

This guide covers robust patterns for transforming flat lists into structured dictionaries.

State Loop Method (Recommended)

Track the current group key as you iterate:

raw_data = [
    "GROUP-A", 10, 20,
    "GROUP-B", 5,
    "GROUP-C", 99, 100, 101
]

def group_by_prefix(data, prefix="GROUP-"):
    """Collect the values that follow each header string into a dict.

    A header is any ``str`` item that starts with ``prefix``. Each header
    opens a fresh, empty list under that key (an earlier list stored under
    the same key is replaced). Every other item is appended to the most
    recently opened list; items seen before the first header are discarded.

    Returns a dict mapping each header to the list of values that follow it.
    """
    groups = {}
    bucket = None  # list currently being filled; None until a header appears

    for entry in data:
        starts_group = isinstance(entry, str) and entry.startswith(prefix)
        if starts_group:
            # Register the header and keep a handle on its list in one step.
            bucket = groups[entry] = []
            continue
        if bucket is not None:
            bucket.append(entry)

    return groups

print(group_by_prefix(raw_data))
# {'GROUP-A': [10, 20], 'GROUP-B': [5], 'GROUP-C': [99, 100, 101]}

Handling Values Before First Header

def group_by_prefix_safe(data, prefix="GROUP-", orphan_key="_orphaned"):
    """Group values under their header, keeping values that precede any header.

    Works like a header/value state loop, except that items appearing before
    the first header are stored under ``orphan_key`` instead of being dropped.
    The ``orphan_key`` entry is removed from the result when it ends up empty.
    """
    groups = {orphan_key: []}
    bucket = groups[orphan_key]  # start by filling the orphan list

    for entry in data:
        if not (isinstance(entry, str) and entry.startswith(prefix)):
            bucket.append(entry)
            continue
        # Header: open a fresh list and make it the active one.
        bucket = groups[entry] = []

    # Only report the orphan group when something was actually collected.
    if len(groups[orphan_key]) == 0:
        groups.pop(orphan_key)

    return groups

# Data with orphaned values at start
data = [1, 2, "GROUP-A", 10, 20, "GROUP-B", 30]
print(group_by_prefix_safe(data))
# {'_orphaned': [1, 2], 'GROUP-A': [10, 20], 'GROUP-B': [30]}

Using a Callable Header Detector

Make the grouping logic more flexible:

def group_by_condition(data, is_header):
    """Group items under the most recent item that ``is_header`` accepts.

    ``is_header`` is called once per item. When it returns a truthy value the
    item becomes a key with a new empty list; otherwise the item is appended
    to the list of the latest header. Items that come before any header are
    skipped.
    """
    grouped = {}
    active = None  # latest header; None means no group is open

    for element in data:
        if is_header(element):
            active = element
            grouped[active] = []
            continue
        if active is None:
            # No open group to attach this value to.
            continue
        grouped[active].append(element)

    return grouped

# Example: Headers are uppercase strings
data = ["SECTION_A", 1, 2, 3, "SECTION_B", 4, 5]
result = group_by_condition(data, lambda x: isinstance(x, str) and x.isupper())
print(result)
# {'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}

# Example: Headers start with '#'
data = ["# Config", "key=value", "# Database", "host=localhost"]
result = group_by_condition(data, lambda x: x.startswith("#"))
print(result)
# {'# Config': ['key=value'], '# Database': ['host=localhost']}

Output:

{'SECTION_A': [1, 2, 3], 'SECTION_B': [4, 5]}
{'# Config': ['key=value'], '# Database': ['host=localhost']}

Using itertools.groupby

For data with clear header/value markers:

from itertools import groupby

data = [
    ("H", "Users"),
    ("V", "Alice"),
    ("V", "Bob"),
    ("H", "Admins"),
    ("V", "Charlie")
]

def group_with_markers(data):
    """Group ``(marker, payload)`` records into a dict keyed by header payload.

    A record whose first element equals ``"H"`` is a header; its second
    element becomes a key with an empty list. Any other record is a value
    whose second element is appended to the list of the latest header.

    Edge cases:
      * Values that appear before the first header have no group and are
        skipped.
      * Back-to-back headers each get their own (possibly empty) group; the
        values that follow belong to the last header of the run.

    Returns a dict mapping each header payload to its list of value payloads.
    """
    result = {}
    current_key = None
    seen_header = False  # separate flag so a header payload of None still works

    for is_header, group in groupby(data, key=lambda x: x[0] == "H"):
        if is_header:
            # groupby merges consecutive headers into one run, so register
            # every header in the run, not just the first one.
            for record in group:
                current_key = record[1]
                result[current_key] = []
            seen_header = True
        elif seen_header:
            result[current_key].extend(record[1] for record in group)
        # Otherwise: values ahead of the first header are ignored.

    return result

print(group_with_markers(data))
# {'Users': ['Alice', 'Bob'], 'Admins': ['Charlie']}

groupby Limitation

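itertools.groupby only merges consecutive items that share the same key; it never sorts or regroups the input. Back-to-back headers therefore arrive as a single run, and values that appear before the first header have no group to join, so both cases need explicit handling. The state loop method is safer for unpredictable data.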

Practical Examples

Log File Parsing

log_lines = [
    "[ERROR] Connection failed",
    "  Timeout after 30s",
    "  Retry count: 3",
    "[WARNING] Disk space low",
    "  Available: 500MB",
    "[ERROR] Authentication failed",
    "  User: admin",
    "  IP: 192.168.1.100"
]

def parse_log_entries(lines):
    """Parse ``[LEVEL] message`` lines and their detail lines into entries.

    A line that starts with ``[`` and contains a closing ``]`` begins a new
    entry: the text between the brackets is the level and the remainder of
    the line (surrounding whitespace removed) is the message. Every other
    line is stripped and attached to the ``details`` list of the current
    entry; such lines are dropped while no entry is open.

    A line that starts with ``[`` but has no ``]`` is not a valid header and
    is treated as a detail line instead of raising ``ValueError``.

    Returns a list of dicts with the keys ``level``, ``message`` and
    ``details``, in input order.
    """
    entries = []
    current_entry = None

    for line in lines:
        if line.startswith("["):
            level, closing, remainder = line[1:].partition("]")
            if closing:
                # Strip rather than skip a fixed number of characters so that
                # "[ERROR]msg" and "[ERROR]  msg" both yield "msg".
                current_entry = {
                    "level": level,
                    "message": remainder.strip(),
                    "details": [],
                }
                # Appending right away avoids a separate flush after the loop;
                # details are added to the same dict object later.
                entries.append(current_entry)
                continue
        if current_entry is not None:
            current_entry["details"].append(line.strip())

    return entries

parsed = parse_log_entries(log_lines)
for entry in parsed:
    print(entry)

Output:

{'level': 'ERROR', 'message': 'Connection failed', 'details': ['Timeout after 30s', 'Retry count: 3']}
{'level': 'WARNING', 'message': 'Disk space low', 'details': ['Available: 500MB']}
{'level': 'ERROR', 'message': 'Authentication failed', 'details': ['User: admin', 'IP: 192.168.1.100']}

Configuration Section Grouping

config_data = [
    "[database]",
    "host=localhost",
    "port=5432",
    "[cache]",
    "enabled=true",
    "ttl=3600",
    "[logging]",
    "level=debug"
]

def parse_ini_style(lines):
    """Parse INI-style lines into ``{section: {key: value}}``.

    Each line is stripped of surrounding whitespace first, so lines read
    straight from a file (with trailing newlines) are handled. A line of the
    form ``[name]`` opens a section with an empty dict (replacing an earlier
    section of the same name). Inside a section, a line containing ``=`` is
    split on the first ``=`` into a key and a value, both stripped of
    surrounding whitespace. Lines without ``=`` and key/value lines that
    appear before any section are ignored.

    All values are returned as strings.
    """
    sections = {}
    current_section = None

    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("[") and line.endswith("]"):
            current_section = line[1:-1]
            sections[current_section] = {}
        # Compare with None so that an empty section name ("[]") still
        # collects its keys.
        elif current_section is not None and "=" in line:
            key, _, value = line.partition("=")
            sections[current_section][key.strip()] = value.strip()

    return sections

print(parse_ini_style(config_data))

Output:

{'database': {'host': 'localhost', 'port': '5432'}, 'cache': {'enabled': 'true', 'ttl': '3600'}, 'logging': {'level': 'debug'}}

Numbered Section Grouping

data = [
    "1.", "First item", "First detail",
    "2.", "Second item",
    "3.", "Third item", "Detail A", "Detail B"
]

import re

def group_numbered_sections(items):
    """Group strings under the numbered marker (e.g. ``"1."``) preceding them.

    An item consisting of digits followed by a single dot starts a new
    section with an empty list. All other items are appended to the section
    opened most recently; items before the first marker are ignored.
    """
    sections = {}
    bucket = None  # list of the open section; None until a marker is seen

    for entry in items:
        if re.match(r"^\d+\.$", entry) is not None:
            bucket = sections[entry] = []
            continue
        if bucket is not None:
            bucket.append(entry)

    return sections

print(group_numbered_sections(data))

Output:

{'1.': ['First item', 'First detail'], '2.': ['Second item'], '3.': ['Third item', 'Detail A', 'Detail B']}

Using defaultdict for Cleaner Code

from collections import defaultdict

def group_by_prefix_defaultdict(data, prefix):
    """Group values under the preceding header string using a ``defaultdict``.

    A header is any ``str`` item that starts with ``prefix``. Every header
    appears in the result, even when no values follow it, so the output shape
    matches the plain state-loop approach. Values following a repeated header
    are added to that header's existing list. Items seen before the first
    header are ignored.

    Returns a plain ``dict`` mapping each header to its list of values.
    """
    groups = defaultdict(list)
    bucket = None  # list of the active group; None until a header is seen

    for item in data:
        if isinstance(item, str) and item.startswith(prefix):
            # Looking the key up registers it in the defaultdict, so a header
            # without values still yields an empty list.
            bucket = groups[item]
        elif bucket is not None:
            bucket.append(item)

    return dict(groups)

data = ["CAT-A", 1, 2, "CAT-B", 3, 4, 5]
print(group_by_prefix_defaultdict(data, "CAT-"))

Output:

{'CAT-A': [1, 2], 'CAT-B': [3, 4, 5]}

Transforming Values During Grouping

def group_and_transform(data, is_header, transform=None):
    """Group items under their header, optionally converting each value.

    Items accepted by ``is_header`` become keys with a new empty list. Other
    items are passed through ``transform`` (when one is supplied) and then
    appended to the list of the latest header. Items that precede the first
    header are skipped without being transformed.
    """
    grouped = {}
    active = None  # latest header; None means no group is open

    for element in data:
        if is_header(element):
            active = element
            grouped[active] = []
            continue
        if active is None:
            continue
        if transform:
            element = transform(element)
        grouped[active].append(element)

    return grouped

# Example: Parse string values to integers
data = ["GROUP-A", "10", "20", "GROUP-B", "30"]
result = group_and_transform(
    data,
    is_header=lambda x: x.startswith("GROUP-"),
    transform=int
)
print(result)

Output:

{'GROUP-A': [10, 20], 'GROUP-B': [30]}

Summary

Method	Readability	Flexibility	Best For
State loop	High	High	General-purpose, messy data
With callable	High	Very high	Configurable header detection
groupby	Medium	Medium	Clean, predictable formats
defaultdict	High	Medium	Cleaner initialization

Best Practice

Use the state loop method for most real-world data. It is explicit, easy to debug, and handles edge cases like orphaned values naturally. Reserve itertools.groupby for clean data with clear markers, where each run of values is preceded by exactly one header.

State Loop Method (Recommended)

Handling Values Before First Header

Using a Callable Header Detector

Using itertools.groupby

Practical Examples

Log File Parsing

Configuration Section Grouping

Numbered Section Grouping

Using defaultdict for Cleaner Code

Transforming Values During Grouping

Summary

Table of Contents

State Loop Method (Recommended)

Handling Values Before First Header

Using a Callable Header Detector

Using itertools.groupby

Practical Examples

Log File Parsing

Configuration Section Grouping

Numbered Section Grouping

Using defaultdict for Cleaner Code

Transforming Values During Grouping

Summary