Pandas - Memory Optimization Tips

Key Insights

• Pandas DataFrames can consume 10-100x more memory than necessary due to default data types—switching from int64 to int8 or using categorical types can reduce memory usage by 90% or more
• Reading CSV files with dtype specification and usecols parameters prevents loading unnecessary data and avoids expensive type inference on large datasets
• Chunked processing and memory-mapped files enable analysis of datasets larger than available RAM without resorting to distributed computing frameworks

Understanding Pandas Memory Usage

Pandas defaults to using the largest data types for safety, but this wastes memory. A DataFrame with 1 million rows and a single integer column uses 8MB with int64, but only 1MB with int8 if values fit within -128 to 127.
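
That 8x gap is easy to verify directly with two Series holding the same values:

```python
import numpy as np
import pandas as pd

# One million small integers stored two ways
n = 1_000_000
as_int64 = pd.Series(np.zeros(n, dtype='int64'))
as_int8 = pd.Series(np.zeros(n, dtype='int8'))

# 8 bytes vs 1 byte per value
print(f"int64: {as_int64.memory_usage(index=False) / 1024**2:.2f} MB")
print(f"int8:  {as_int8.memory_usage(index=False) / 1024**2:.2f} MB")
```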

import pandas as pd
import numpy as np

# Create sample DataFrame
df = pd.DataFrame({
    'id': range(1000000),
    'category': np.random.choice(['A', 'B', 'C'], 1000000),
    'value': np.random.randint(0, 100, 1000000)
})

# Check memory usage
print(df.memory_usage(deep=True))
print(f"Total: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

The deep=True parameter is critical—it includes the actual memory consumed by object types like strings, not just the pointer references.
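
To see what deep=True changes, compare the two readings on an object column (a small illustration):

```python
import pandas as pd

# Shallow vs deep accounting for an object (string) column
s = pd.Series(['hello world'] * 100_000, dtype=object)
print(f"shallow: {s.memory_usage(deep=False):,} bytes")  # 8-byte pointers only
print(f"deep:    {s.memory_usage(deep=True):,} bytes")   # includes string payloads
```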

Optimize Numeric Types

Downcast numeric columns to the smallest type that can hold your data. Pandas provides automatic downcasting, but manual specification gives you control.

def optimize_numeric(df):
    """Downcast numeric columns to smallest possible type"""
    for col in df.select_dtypes(include=['int']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in df.select_dtypes(include=['float']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    return df

# Before optimization
print(f"Before: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Apply optimization
df = optimize_numeric(df)

# After optimization
print(f"After: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

For known ranges, specify types explicitly:

# If you know value ranges
df = pd.DataFrame({
    'small_int': pd.array(range(100), dtype='int8'),      # -128 to 127
    'medium_int': pd.array(range(1000), dtype='int16'),   # -32768 to 32767
    'percentage': pd.array(np.random.random(100), dtype='float32')
})

Use Categorical Data Types

Categorical types are ideal for columns with limited unique values. They store data as integer codes with a mapping to actual values, dramatically reducing memory for repetitive string data.

# String column with repeated values
df = pd.DataFrame({
    'country': np.random.choice(['USA', 'UK', 'Canada', 'Australia'], 1000000)
})

print(f"Object type: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Convert to categorical
df['country'] = df['country'].astype('category')
print(f"Categorical: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Check the reduction
print(f"Categories: {df['country'].cat.categories}")
print(f"Codes sample: {df['country'].cat.codes.head()}")

Rule of thumb: use categorical when unique values are less than 50% of total rows.

def convert_to_categorical(df, threshold=0.5):
    """Convert object columns to categorical if beneficial"""
    for col in df.select_dtypes(include=['object']).columns:
        num_unique = df[col].nunique()
        num_total = len(df[col])
        
        if num_unique / num_total < threshold:
            df[col] = df[col].astype('category')
    
    return df

Optimize Data Loading

Loading data efficiently prevents memory waste from the start. Specify dtypes and load only needed columns.

# Inefficient - loads everything with inferred types
df = pd.read_csv('large_file.csv')

# Efficient - specify types and columns
dtypes = {
    'id': 'int32',
    'category': 'category',
    'value': 'float32'
}

df = pd.read_csv(
    'large_file.csv',
    dtype=dtypes,
    usecols=['id', 'category', 'value', 'date'],
    parse_dates=['date']  # parsed directly to datetime64, no string detour
)

For truly large files, use chunking:

def process_large_csv(filepath, chunksize=100000):
    """Process CSV in chunks to avoid memory overflow"""
    results = []
    
    for chunk in pd.read_csv(filepath, chunksize=chunksize):
        # Shrink each chunk before aggregating
        chunk = optimize_numeric(chunk)
        chunk = convert_to_categorical(chunk)
        
        # Keep sums and counts, not means: averaging per-chunk means
        # would weight a short final chunk as heavily as a full one
        processed = chunk.groupby('category')['value'].agg(['sum', 'count'])
        results.append(processed)
    
    # Combine partial aggregates into the exact overall mean
    totals = pd.concat(results).groupby(level=0).sum()
    return totals['sum'] / totals['count']

summary = process_large_csv('huge_file.csv')
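
The key insights above also mention memory-mapped files. For numeric data, NumPy's np.memmap keeps the array on disk and reads only the slices you touch (a sketch; 'values.dat' is a hypothetical path):

```python
import numpy as np

# Write a large float32 array into a disk-backed memory map
arr = np.memmap('values.dat', dtype='float32', mode='w+', shape=(1_000_000,))
arr[:] = np.arange(1_000_000, dtype='float32')
arr.flush()

# Reopen read-only: slicing loads data from disk on demand, not all at once
view = np.memmap('values.dat', dtype='float32', mode='r', shape=(1_000_000,))
print(view[:5])
```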

Sparse Data Structures

For DataFrames with mostly zeros or NaN values, sparse data types store only non-zero values.

# Dense array - stores all values
dense = pd.DataFrame({
    'A': [0, 0, 1, 0, 0, 0, 2, 0, 0, 0] * 10000
})

print(f"Dense: {dense.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Sparse array - stores only non-zero values
sparse = dense.astype(pd.SparseDtype('int64', fill_value=0))
print(f"Sparse: {sparse.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Check sparsity
print(f"Density: {sparse['A'].sparse.density:.2%}")

Sparse structures are beneficial when density is below 10-20%.
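
That threshold can be checked automatically before converting (a small helper; the 0.1 default mirrors the 10% guideline and assumes 0 is the fill value):

```python
import pandas as pd

def maybe_sparsify(df, max_density=0.1):
    """Convert numeric columns to sparse when they are mostly zeros."""
    for col in df.select_dtypes(include=['number']).columns:
        density = (df[col] != 0).mean()  # fraction of non-zero values
        if density < max_density:
            df[col] = df[col].astype(pd.SparseDtype(df[col].dtype, fill_value=0))
    return df

# 'A' is 95% zeros and converts; 'B' is dense and stays as-is
df = pd.DataFrame({'A': [0] * 95 + [1] * 5, 'B': range(100)})
df = maybe_sparsify(df)
print(df.dtypes)
```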

Memory-Efficient String Operations

Each .str method call copies the entire column, whether the calls are chained or written as separate statements (and inplace=True does not avoid these copies). To lower peak memory, reduce the number of column-wide passes.

# Three column-wide passes - each .str call materializes a full copy
df['text'] = df['text'].str.lower().str.strip().str.replace(' ', '_')

# One pass - transform each value once with plain Python string methods
# (assumes no missing values; slower per element, but only one new column)
df['text'] = df['text'].map(lambda s: s.strip().lower().replace(' ', '_'))

For large string columns, consider PyArrow string type:

# Requires pandas >= 1.3 and pyarrow
df['text'] = df['text'].astype('string[pyarrow]')
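
Where pyarrow is installed, the saving can be measured the same way as before (a sketch that degrades gracefully if pyarrow is missing):

```python
import numpy as np
import pandas as pd

# Repetitive string column stored as plain Python objects
words = pd.Series(np.random.choice(['alpha', 'beta', 'gamma'], 100_000),
                  dtype=object)
print(f"object:          {words.memory_usage(deep=True) / 1024**2:.2f} MB")

try:
    # Arrow-backed strings: contiguous buffer instead of per-cell objects
    arrow = words.astype('string[pyarrow]')
    print(f"string[pyarrow]: {arrow.memory_usage(deep=True) / 1024**2:.2f} MB")
except ImportError:
    print("pyarrow not installed - skipping comparison")
```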

Monitor and Profile Memory Usage

Create utilities to track memory consumption throughout your pipeline.

def memory_usage_mb(df):
    """Return memory usage in MB"""
    return df.memory_usage(deep=True).sum() / 1024**2

def compare_memory(df_before, df_after, label="Optimization"):
    """Compare memory before and after optimization"""
    before = memory_usage_mb(df_before)
    after = memory_usage_mb(df_after)
    reduction = (before - after) / before * 100
    
    print(f"{label}:")
    print(f"  Before: {before:.2f} MB")
    print(f"  After: {after:.2f} MB")
    print(f"  Reduction: {reduction:.1f}%")

# Usage
df_original = df.copy()
df_optimized = optimize_numeric(convert_to_categorical(df))
compare_memory(df_original, df_optimized)

Profile specific operations:

from memory_profiler import profile

@profile
def process_data(filepath):
    df = pd.read_csv(filepath)
    df = optimize_numeric(df)
    df = convert_to_categorical(df)
    return df.groupby('category').agg({'value': ['mean', 'sum']})
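
memory_profiler is a third-party package; if it is unavailable, the standard library's tracemalloc gives a dependency-free reading of Python-level allocations (a sketch; `peak_memory_of` is a hypothetical helper name):

```python
import tracemalloc

import numpy as np
import pandas as pd

def peak_memory_of(fn, *args, **kwargs):
    """Run fn and report peak Python-level allocations via tracemalloc."""
    tracemalloc.start()
    result = fn(*args, **kwargs)
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    print(f"Peak during call: {peak / 1024**2:.2f} MB")
    return result

# Example: building a million-row DataFrame
df = peak_memory_of(pd.DataFrame, {'value': np.arange(1_000_000)})
```

Note that tracemalloc sees allocations routed through Python's allocator (NumPy reports its buffers there), but not memory held privately by arbitrary C extensions.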

Complete Optimization Pipeline

Combine techniques into a reusable pipeline:

def optimize_dataframe(df, categorical_threshold=0.5):
    """Apply all memory optimizations"""
    # Store original memory
    original_memory = memory_usage_mb(df)
    
    # Optimize numeric types
    for col in df.select_dtypes(include=['int']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in df.select_dtypes(include=['float']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    # Convert to categorical where appropriate
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() / len(df) < categorical_threshold:
            df[col] = df[col].astype('category')
    
    # Report results
    optimized_memory = memory_usage_mb(df)
    print(f"Memory reduced from {original_memory:.2f} MB to {optimized_memory:.2f} MB "
          f"({(1 - optimized_memory/original_memory)*100:.1f}% reduction)")
    
    return df

# Apply to your DataFrame
df = optimize_dataframe(df)

These optimizations are not premature—they’re essential for working with real-world datasets. Apply them at data load time, and your pipelines will run faster and scale further without infrastructure changes.
