Python - Regular Expressions (re module) Guide

Key Insights

The re module provides powerful pattern matching with compile-once-use-many optimization through compiled pattern objects, significantly improving performance in loops
Named capture groups make extracted fields self-documenting, while lookahead/lookbehind assertions test the surrounding context without consuming characters; together they are critical for parsing structured data
Understanding the difference between match(), search(), and findall() prevents common bugs—match() only checks string start, while search() scans the entire string

Core Pattern Matching Methods

The re module offers four primary methods for pattern matching, each suited for different scenarios. Understanding when to use each prevents unnecessary complexity.

import re

text = "Contact: john@example.com or support@company.org"

# All four calls below share this deliberately simple e-mail pattern.
email_pattern = r'\w+@\w+\.\w+'

# match() is anchored at index 0, so an address later in the string is missed
result = re.match(email_pattern, text)
print(result)  # None - pattern not at start

# search() scans forward and stops at the first hit
result = re.search(email_pattern, text)
first_email = result.group()
print(first_email)  # john@example.com

# findall() collects every non-overlapping match as a plain string
emails = re.findall(email_pattern, text)
print(emails)  # ['john@example.com', 'support@company.org']

# finditer() yields match objects, exposing positions as well as text
for match in re.finditer(email_pattern, text):
    found, offset = match.group(), match.start()
    print(f"Found {found} at position {offset}")

Compiled Patterns for Performance

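Compiling a pattern once and reusing the compiled object pays off when the same pattern is applied to many strings or inside a loop. The module-level functions such as re.match() already cache recently compiled patterns, so the saving comes mainly from skipping the per-call cache lookup rather than from avoiding recompilation; expect a measurable but modest speed-up.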

import re
import time

# Pattern string handed to re.match() on every iteration - inefficient
def slow_validation(emails):
    """Return the items of *emails* that look like e-mail addresses.

    The raw pattern string is passed to the module-level ``re.match`` once
    per item; this is the baseline the compiled version is compared against.
    """
    return [
        candidate
        for candidate in emails
        if re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', candidate)
    ]

# Pattern compiled once per call and reused for every item - efficient
def fast_validation(emails):
    """Return the items of *emails* that look like e-mail addresses.

    The pattern is compiled a single time and its bound ``match`` method is
    used as the filter predicate for each item.
    """
    address_re = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    return list(filter(address_re.match, emails))

# Performance comparison
test_emails = ['user@example.com', 'invalid', 'test@test.co'] * 10000

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the benchmark runs and may have coarse resolution.
start = time.perf_counter()
slow_validation(test_emails)
print(f"Without compile: {time.perf_counter() - start:.4f}s")

start = time.perf_counter()
fast_validation(test_emails)
print(f"With compile: {time.perf_counter() - start:.4f}s")

Capture Groups and Named Groups

Capture groups extract specific portions of matched text. Named groups improve code readability and maintainability.

import re

# Parsing log entries with numbered groups
log_line = "2024-01-15 14:23:45 ERROR Database connection failed"
pattern = r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) (.+)'
match = re.search(pattern, log_line)

if match:
    # groups() returns the captures in pattern order. The *_str names avoid
    # rebinding the name `time`, which would hide the standard-library
    # module of the same name for any code that runs afterwards.
    date_str, time_str, level, message = match.groups()
    print(f"Level: {level}, Message: {message}")

# Named groups for clarity
pattern = r'(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}) (?P<level>\w+) (?P<message>.+)'
match = re.search(pattern, log_line)

if match:
    print(f"Level: {match.group('level')}")
    print(f"Message: {match.group('message')}")
    print(match.groupdict())  # Returns dict of all named groups

# Non-capturing groups with (?:...)
url = "https://api.example.com/v1/users"
pattern = r'(?:https?://)?([^/]+)'  # Don't capture protocol
match = re.search(pattern, url)
print(match.group(1))  # api.example.com

Substitution and Replacement

The sub() and subn() methods enable pattern-based text replacement with support for backreferences and callable replacements.

import re

# Basic substitution: \1 in the replacement re-inserts the captured amount
text = "Price: $99.99, Tax: $8.50"
price_pattern = re.compile(r'\$(\d+\.\d{2})')
result = price_pattern.sub(r'USD \1', text)
print(result)  # Price: USD 99.99, Tax: USD 8.50


# A callable replacement receives each match object and returns its new text
def convert_to_uppercase(match):
    """Return the full text of *match* converted to upper case."""
    matched_text = match.group(0)
    return matched_text.upper()


text = "error: connection failed, warning: timeout"
result = re.sub(r'\b(error|warning)\b', repl=convert_to_uppercase, string=text)
print(result)  # ERROR: connection failed, WARNING: timeout

# subn() hands back (new_string, number_of_replacements)
text = "foo bar foo baz"
replaced = re.subn(r'foo', 'qux', text)
result, count = replaced
print(f"Result: {result}, Replacements: {count}")  # qux bar qux baz, 2

# Backreferences in the replacement swap the two captured words
text = "John Doe, Jane Smith"
pattern = r'(\w+) (\w+)'
result = re.sub(pattern, r'\2, \1', text)
print(result)  # Doe, John, Smith, Jane

Lookahead and Lookbehind Assertions

Assertions match positions without consuming characters, essential for complex parsing requirements.

import re

# Positive lookahead (?=...)
text = "password123, secret456, public789"
# Find the alphabetic prefix of each token that is followed by a digit.
# The character class must exclude digits: \w also matches 0-9, so
# \w+(?=\d+) would swallow most of the number and return
# ['password12', 'secret45', 'public78'].
pattern = r'[a-zA-Z]+(?=\d)'
matches = re.findall(pattern, text)
print(matches)  # ['password', 'secret', 'public']

# Negative lookahead (?!...)
# Find words NOT followed by digits.
# The run of letters must not be followed by another letter or a digit: this
# rejects "item" in "item1" and also stops the engine from backtracking to a
# shorter prefix such as "ite". (\b\w+\b(?!\d) does not work here because \w+
# consumes the trailing digit itself, so every token would match.)
pattern = r'\b[a-zA-Z]+(?![a-zA-Z\d])'
text = "item1 item2 standalone"
matches = re.findall(pattern, text)
print(matches)  # ['standalone']

# Positive lookbehind (?<=...)
# Keep only digit runs that sit directly after a currency symbol
text = "$100 €200 £300"
pattern = r'(?<=[$€£])\d+'
amounts = [hit.group() for hit in re.finditer(pattern, text)]
print(amounts)  # ['100', '200', '300']

# Negative lookbehind (?<!...)
# Keep only whole numbers that are NOT directly preceded by a currency symbol
text = "Price $50, quantity 10"
pattern = r'(?<![$€£])\b\d+\b'
matches = [hit.group() for hit in re.finditer(pattern, text)]
print(matches)  # ['10']

# Combining assertions for complex extraction
# The lookbehind skips the "@" and the lookahead stops before ".com",
# so neither delimiter ends up in the result
email = "user@example.com"
pattern = r'(?<=@)[a-zA-Z0-9.-]+(?=\.com)'
domain = re.search(pattern, email)
domain_name = domain.group()
print(domain_name)  # example

Flags and Modifiers

Regex flags modify pattern matching behavior. Multiple flags combine with the bitwise OR operator.

import re

# IGNORECASE - letters match regardless of case
text = "Python PYTHON python"
matches = re.findall(r'python', text, flags=re.IGNORECASE)
match_count = len(matches)
print(match_count)  # 3

# MULTILINE - ^ and $ also match at the start and end of every line
text = """first line
second line
third line"""
line_start_word = re.compile(r'^[a-z]+', flags=re.MULTILINE)
matches = line_start_word.findall(text)
print(matches)  # ['first', 'second', 'third']

# DOTALL - the dot also matches newline characters
text = "line1\nline2\nline3"
match = re.compile(r'line1.*line3', flags=re.DOTALL).search(text)
print(match.group())  # Matches across lines

# VERBOSE - whitespace and #-comments inside the pattern are ignored
pattern = re.compile(r'''
    ^                   # Start of string
    (?P<protocol>https?)  # Protocol
    ://                 # Separator
    (?P<domain>[^/]+)   # Domain
    (?P<path>/.*)?      # Optional path
    $                   # End of string
''', flags=re.VERBOSE)

url = "https://example.com/api/users"
match = pattern.match(url)
url_parts = match.groupdict()
print(url_parts)

# Flags are bit masks, so several are combined with |
combined_flags = re.IGNORECASE | re.DOTALL | re.MULTILINE
pattern = re.compile(r'^start.*end$', combined_flags)

Practical Validation Patterns

Real-world validation scenarios require robust patterns that handle edge cases.

import re

class Validator:
    """Pre-compiled patterns and helpers for validating common input formats.

    Every pattern is anchored with ``^`` and ``\\Z``. ``\\Z`` matches only at
    the very end of the string, whereas ``$`` also matches just before a
    trailing newline and would therefore accept input such as
    ``"user@example.com\\n"``.
    """

    # Email validation (RFC 5322 simplified): a local part made of the
    # permitted characters, then one or more dot-separated labels of at most
    # 63 characters that neither start nor end with a hyphen.
    EMAIL = re.compile(
        r'^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@'
        r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
        r'(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\Z'
    )

    # Phone number (US format): optional "+" and "1" prefix, optional
    # parentheses around the area code, optional space/dot/hyphen separators.
    # The three groups capture area code, exchange and line number.
    PHONE = re.compile(r'^\+?1?\s*\(?(\d{3})\)?[\s.-]?(\d{3})[\s.-]?(\d{4})\Z')

    # IPv4 address: four dot-separated octets, each limited to 0-255.
    IPV4 = re.compile(
        r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
        r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\Z'
    )

    # URL validation: http/https scheme, then a domain name, "localhost" or a
    # dotted quad, an optional port, and an optional path or query string.
    # The top-level label may be up to 63 letters long so that long TLDs such
    # as ".photography" are accepted.
    URL = re.compile(
        r'^https?://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
        r'localhost|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)\Z', re.IGNORECASE
    )

    @classmethod
    def validate_email(cls, email):
        """Return True if *email* matches the EMAIL pattern, else False."""
        return bool(cls.EMAIL.match(email))

    @classmethod
    def validate_phone(cls, phone):
        """Return *phone* normalised to ``(AAA) EEE-NNNN``, or None.

        None is returned when *phone* does not match the PHONE pattern.
        """
        match = cls.PHONE.match(phone)
        if match:
            return f"({match.group(1)}) {match.group(2)}-{match.group(3)}"
        return None

# Usage: the helper methods return plain values, while the class-level
# patterns can also be used directly and return match objects
email_ok = Validator.validate_email("user@example.com")
print(email_ok)  # True
formatted_phone = Validator.validate_phone("5551234567")
print(formatted_phone)  # (555) 123-4567
ipv4_match = Validator.IPV4.match("192.168.1.1")
print(ipv4_match)  # Match object
url_match = Validator.URL.match("https://example.com/path")
print(url_match)  # Match object

Splitting and Tokenization

The split() method provides advanced string splitting with pattern-based delimiters.

import re

# A character class lets a single split() call handle several delimiters
text = "apple,banana;cherry|date:elderberry"
delimiter_re = re.compile(r'[,;|:]')
fruits = delimiter_re.split(text)
print(fruits)  # ['apple', 'banana', 'cherry', 'date', 'elderberry']

# Wrapping the delimiter in a capture group keeps it in the result list
text = "one1two2three3four"
parts = re.compile(r'(\d+)').split(text)
print(parts)  # ['one', '1', 'two', '2', 'three', '3', 'four']

# maxsplit caps the number of splits; the remainder stays in one piece
text = "a:b:c:d:e"
parts = re.compile(r':').split(text, 2)
print(parts)  # ['a', 'b', 'c:d:e']

# Tokenize code-like syntax: either a run of word characters or a single
# character that is neither a word character nor whitespace
code = "func(arg1, arg2, 'string value')"
tokens = [tok.group() for tok in re.finditer(r'\w+|[^\w\s]', code)]
print(tokens)  # ['func', '(', 'arg1', ',', 'arg2', ',', "'", 'string', ...]

The re module’s power lies in understanding method selection, compilation optimization, and assertion mechanics. Master these fundamentals before attempting complex patterns.