Pandas - Get DataFrame Info and Memory Usage
The `info()` method is your first stop when examining a new DataFrame. It displays the DataFrame's structure, including the number of entries, column names, non-null counts, data types, and memory usage.
Key Insights
- The `info()` method provides a comprehensive overview of DataFrame structure, including column types, non-null counts, and memory usage, making it essential for initial data exploration and debugging
- Memory optimization in Pandas can reduce DataFrame size by 50-90% through strategic dtype conversion, particularly by downcasting numeric types and using categorical data for low-cardinality columns
- The `memory_usage()` method offers granular control for analyzing memory consumption at the column level, enabling targeted optimization of large datasets
Understanding DataFrame Info with info()
The info() method is your first stop when examining a new DataFrame. It displays the DataFrame’s structure, including the number of entries, column names, non-null counts, data types, and memory usage.
import pandas as pd
import numpy as np

# Build a 1,000-row sample DataFrame covering the common dtypes
# (int, object, float, bool, datetime) so info() has something to show.
sample_columns = {
    'user_id': range(1000),
    'username': [f'user_{n}' for n in range(1000)],
    'age': np.random.randint(18, 80, 1000),
    'score': np.random.uniform(0, 100, 1000),
    'premium': np.random.choice([True, False], 1000),
    'signup_date': pd.date_range('2020-01-01', periods=1000, freq='D'),
}
df = pd.DataFrame(sample_columns)
df.info()
Output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1000 non-null int64
1 username 1000 non-null object
2 age 1000 non-null int64
3 score 1000 non-null float64
4 premium 1000 non-null bool
5 signup_date 1000 non-null datetime64[ns]
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 40.3+ KB
The info() method accepts several parameters for customization:
# Show detailed memory usage including index
df.info(memory_usage='deep')
# Limit output to first and last 5 columns (useful for wide DataFrames)
df.info(max_cols=10)
# Show all columns regardless of DataFrame size
df.info(show_counts=True, verbose=True)
# Suppress output (useful in scripts)
buffer = df.info(buf=None, verbose=False)
Analyzing Memory Usage with memory_usage()
The memory_usage() method provides column-level memory consumption details, returning a Series with memory usage in bytes for each column.
# Per-column memory consumption in bytes; deep=True also counts the
# payload of object columns (e.g. the strings themselves).
memory_stats = df.memory_usage(deep=True)
print(memory_stats)
Output:
Index 128
user_id 8000
username 62712
age 8000
score 8000
premium 1000
signup_date 8000
dtype: int64
The deep=True parameter is critical for accurate memory measurement, especially for object dtypes:
# Compare shallow (pointer-only) and deep (payload-inclusive) accounting
print("Shallow memory usage:")
print(df.memory_usage())
print("\nDeep memory usage:")
print(df.memory_usage(deep=True))

# Total footprint in MB for each accounting mode
bytes_per_mb = 1024 ** 2
total_shallow = df.memory_usage().sum() / bytes_per_mb
total_deep = df.memory_usage(deep=True).sum() / bytes_per_mb
print(f"\nShallow total: {total_shallow:.2f} MB")
print(f"Deep total: {total_deep:.2f} MB")
Memory Optimization Through Dtype Conversion
Pandas defaults to 64-bit integers and floats, which often waste memory. Downcasting to smaller dtypes can dramatically reduce memory footprint.
# Build a 100k-row DataFrame whose default 64-bit dtypes waste memory
df_large = pd.DataFrame({
    'tiny_int': np.random.randint(0, 100, 100000),
    'small_int': np.random.randint(0, 1000, 100000),
    'medium_int': np.random.randint(0, 100000, 100000),
    'float_col': np.random.uniform(0, 100, 100000),
    'category_col': np.random.choice(['A', 'B', 'C', 'D'], 100000)
})

print("Original memory usage:")
print(df_large.memory_usage(deep=True))
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Downcast each column to the smallest dtype that still holds its values
target_dtypes = {
    'tiny_int': 'int8',          # values < 100 fit in a signed byte
    'small_int': 'int16',
    'medium_int': 'int32',
    'float_col': 'float32',
    'category_col': 'category',  # 4 distinct labels -> small integer codes
}
for column, dtype in target_dtypes.items():
    df_large[column] = df_large[column].astype(dtype)

print("\nOptimized memory usage:")
print(df_large.memory_usage(deep=True))
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
Automated Memory Optimization Function
Create a reusable function to automatically optimize DataFrame memory:
def optimize_dataframe(df, aggressive=False):
    """
    Optimize DataFrame memory usage through dtype conversion.

    int64 columns are downcast to the smallest signed integer type whose
    range contains the column's values; float64 columns become float32
    (at the cost of ~7 significant digits); object columns become
    categorical when fewer than half their values are unique, or
    unconditionally when ``aggressive`` is True.

    Note: the DataFrame is modified in place and also returned.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to optimize
    aggressive : bool
        If True, convert all object columns to category

    Returns:
    --------
    pd.DataFrame
        Optimized DataFrame
    """
    initial_memory = df.memory_usage(deep=True).sum() / 1024**2
    n_rows = len(df)

    for col in df.columns:
        col_type = df[col].dtype
        # Optimize integers. Bounds are inclusive (>=, <=) so values
        # exactly at a type's limit (e.g. 127 for int8) still qualify.
        if col_type == 'int64':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
        # Optimize floats
        elif col_type == 'float64':
            df[col] = df[col].astype(np.float32)
        # Optimize objects to category (worthwhile when values repeat);
        # the n_rows guard avoids ZeroDivisionError on an empty frame
        elif col_type == 'object':
            if aggressive or (n_rows > 0 and df[col].nunique() / n_rows < 0.5):
                df[col] = df[col].astype('category')

    final_memory = df.memory_usage(deep=True).sum() / 1024**2
    if initial_memory > 0:
        reduction = 100 * (initial_memory - final_memory) / initial_memory
    else:
        reduction = 0.0
    print(f"Memory reduced from {initial_memory:.2f} MB to {final_memory:.2f} MB")
    print(f"Reduction: {reduction:.1f}%")
    return df
# Exercise the optimization helper on a 50k-row sample
sample_size = 50000
sample_columns = {
    'id': range(sample_size),
    'value': np.random.randint(0, 100, sample_size),
    'price': np.random.uniform(10, 1000, sample_size),
    'status': np.random.choice(['active', 'inactive', 'pending'], sample_size),
}
df_test = pd.DataFrame(sample_columns)
df_optimized = optimize_dataframe(df_test.copy(), aggressive=True)
Identifying Memory-Hungry Columns
Quickly identify which columns consume the most memory to target optimization efforts:
def analyze_memory_by_column(df, top_n=10):
    """Display top N memory-consuming columns.

    Returns a DataFrame with one row per column (largest first) showing
    its memory in MB, dtype, unique-value count, and share of the total.
    """
    per_column = df.memory_usage(deep=True).sort_values(ascending=False)
    # The 'Index' entry describes the index, not a real column — drop it
    per_column = per_column.drop('Index', errors='ignore')
    top = per_column.head(top_n)

    summary = pd.DataFrame({
        'Column': top.index,
        'Memory_MB': top.values / 1024**2,
        'Dtype': [df[name].dtype for name in top.index],
        'Unique_Values': [df[name].nunique() for name in top.index],
    })
    total_mb = df.memory_usage(deep=True).sum() / 1024**2
    summary['Pct_of_Total'] = 100 * summary['Memory_MB'] / total_mb
    return summary
# Example usage: a string-heavy column should dominate the report
demo_columns = {
    'text_col': ['long string ' * 100 for _ in range(10000)],
    'int_col': range(10000),
    'cat_col': np.random.choice(['A', 'B', 'C'], 10000),
}
df_analysis = pd.DataFrame(demo_columns)
print(analyze_memory_by_column(df_analysis))
Monitoring Memory During Data Processing
Track memory usage throughout your data pipeline to identify bottlenecks:
import time
class MemoryMonitor:
    """Records a DataFrame's deep memory usage at labeled checkpoints."""

    def __init__(self, df):
        self.df = df
        self.checkpoints = []  # one dict per checkpoint() call

    def checkpoint(self, label):
        """Record memory usage at a specific point."""
        usage_mb = self.df.memory_usage(deep=True).sum() / 1024**2
        entry = {
            'label': label,
            'memory_mb': usage_mb,
            'timestamp': time.time(),
        }
        self.checkpoints.append(entry)

    def report(self):
        """Generate memory usage report."""
        summary = pd.DataFrame(self.checkpoints)
        # delta_mb only makes sense once there are two or more snapshots
        if len(summary) > 1:
            summary['delta_mb'] = summary['memory_mb'].diff()
        return summary
# Example usage: watch memory grow as derived columns are added
df_monitor = pd.DataFrame({'values': range(100000)})
monitor = MemoryMonitor(df_monitor)
monitor.checkpoint('Initial load')

for new_column, power, label in (
    ('squared', 2, 'After squared calculation'),
    ('cubed', 3, 'After cubed calculation'),
):
    df_monitor[new_column] = df_monitor['values'] ** power
    monitor.checkpoint(label)

print(monitor.report())
These techniques provide comprehensive control over DataFrame memory usage, enabling you to work with larger datasets efficiently and identify optimization opportunities in production environments.