Pandas - Get DataFrame Info and Memory Usage
The `info()` method is your first stop when examining a new DataFrame. It displays the DataFrame's structure, including the number of entries, column names, non-null counts, data types, and memory usage.
Key Insights
- The `info()` method provides a comprehensive overview of DataFrame structure, including column types, non-null counts, and memory usage, making it essential for initial data exploration and debugging
- Memory optimization in Pandas can reduce DataFrame size by 50-90% through strategic dtype conversion, particularly by downcasting numeric types and using categorical data for low-cardinality columns
- The `memory_usage()` method offers granular control for analyzing memory consumption at the column level, enabling targeted optimization of large datasets
Understanding DataFrame Info with info()
The info() method is your first stop when examining a new DataFrame. It displays the DataFrame’s structure, including the number of entries, column names, non-null counts, data types, and memory usage.
import pandas as pd
import numpy as np

# Build a 1,000-row sample DataFrame covering the common dtypes
# (int, object, float, bool, datetime) so info() has something to show.
sample_columns = {
    'user_id': range(1000),
    'username': [f'user_{n}' for n in range(1000)],
    'age': np.random.randint(18, 80, 1000),
    'score': np.random.uniform(0, 100, 1000),
    'premium': np.random.choice([True, False], 1000),
    'signup_date': pd.date_range('2020-01-01', periods=1000, freq='D'),
}
df = pd.DataFrame(sample_columns)
df.info()
Output:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1000 non-null int64
1 username 1000 non-null object
2 age 1000 non-null int64
3 score 1000 non-null float64
4 premium 1000 non-null bool
5 signup_date 1000 non-null datetime64[ns]
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 40.3+ KB
The info() method accepts several parameters for customization:
# Show detailed memory usage including index
df.info(memory_usage='deep')
# Limit output to first and last 5 columns (useful for wide DataFrames)
df.info(max_cols=10)
# Show all columns regardless of DataFrame size
df.info(show_counts=True, verbose=True)
# Suppress output (useful in scripts)
buffer = df.info(buf=None, verbose=False)
Analyzing Memory Usage with memory_usage()
The memory_usage() method provides column-level memory consumption details, returning a Series with memory usage in bytes for each column.
# Per-column memory consumption in bytes; deep=True also counts the
# payload of object columns (e.g. the strings themselves).
memory_stats = df.memory_usage(deep=True)
print(memory_stats)
Output:
Index 128
user_id 8000
username 62712
age 8000
score 8000
premium 1000
signup_date 8000
dtype: int64
The deep=True parameter is critical for accurate memory measurement, especially for object dtypes:
# Compare shallow (pointer-only) and deep (payload-inclusive) accounting
print("Shallow memory usage:")
print(df.memory_usage())
print("\nDeep memory usage:")
print(df.memory_usage(deep=True))

# Total footprint in MB for each accounting mode
bytes_per_mb = 1024 ** 2
total_shallow = df.memory_usage().sum() / bytes_per_mb
total_deep = df.memory_usage(deep=True).sum() / bytes_per_mb
print(f"\nShallow total: {total_shallow:.2f} MB")
print(f"Deep total: {total_deep:.2f} MB")
Memory Optimization Through Dtype Conversion
Pandas defaults to 64-bit integers and floats, which often waste memory. Downcasting to smaller dtypes can dramatically reduce memory footprint.
# Build a 100k-row DataFrame whose default 64-bit dtypes waste memory
df_large = pd.DataFrame({
    'tiny_int': np.random.randint(0, 100, 100000),
    'small_int': np.random.randint(0, 1000, 100000),
    'medium_int': np.random.randint(0, 100000, 100000),
    'float_col': np.random.uniform(0, 100, 100000),
    'category_col': np.random.choice(['A', 'B', 'C', 'D'], 100000)
})

print("Original memory usage:")
print(df_large.memory_usage(deep=True))
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Downcast each column to the smallest dtype that still holds its values
target_dtypes = {
    'tiny_int': 'int8',          # values < 100 fit in a signed byte
    'small_int': 'int16',
    'medium_int': 'int32',
    'float_col': 'float32',
    'category_col': 'category',  # 4 distinct labels -> small integer codes
}
for column, dtype in target_dtypes.items():
    df_large[column] = df_large[column].astype(dtype)

print("\nOptimized memory usage:")
print(df_large.memory_usage(deep=True))
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
Automated Memory Optimization Function
Create a reusable function to automatically optimize DataFrame memory:
def optimize_dataframe(df, aggressive=False):
    """
    Optimize DataFrame memory usage through dtype conversion.

    int64 columns are downcast to the smallest signed integer type whose
    range contains the column's values; float64 columns become float32
    (at the cost of ~7 significant digits); object columns become
    categorical when fewer than half their values are unique, or
    unconditionally when ``aggressive`` is True.

    Note: the DataFrame is modified in place and also returned.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to optimize
    aggressive : bool
        If True, convert all object columns to category

    Returns:
    --------
    pd.DataFrame
        Optimized DataFrame
    """
    initial_memory = df.memory_usage(deep=True).sum() / 1024**2
    n_rows = len(df)

    for col in df.columns:
        col_type = df[col].dtype
        # Optimize integers. Bounds are inclusive (>=, <=) so values
        # exactly at a type's limit (e.g. 127 for int8) still qualify.
        if col_type == 'int64':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
        # Optimize floats
        elif col_type == 'float64':
            df[col] = df[col].astype(np.float32)
        # Optimize objects to category (worthwhile when values repeat);
        # the n_rows guard avoids ZeroDivisionError on an empty frame
        elif col_type == 'object':
            if aggressive or (n_rows > 0 and df[col].nunique() / n_rows < 0.5):
                df[col] = df[col].astype('category')

    final_memory = df.memory_usage(deep=True).sum() / 1024**2
    if initial_memory > 0:
        reduction = 100 * (initial_memory - final_memory) / initial_memory
    else:
        reduction = 0.0
    print(f"Memory reduced from {initial_memory:.2f} MB to {final_memory:.2f} MB")
    print(f"Reduction: {reduction:.1f}%")
    return df
# Exercise the optimization helper on a 50k-row sample
sample_size = 50000
sample_columns = {
    'id': range(sample_size),
    'value': np.random.randint(0, 100, sample_size),
    'price': np.random.uniform(10, 1000, sample_size),
    'status': np.random.choice(['active', 'inactive', 'pending'], sample_size),
}
df_test = pd.DataFrame(sample_columns)
df_optimized = optimize_dataframe(df_test.copy(), aggressive=True)
Identifying Memory-Hungry Columns
Quickly identify which columns consume the most memory to target optimization efforts:
def analyze_memory_by_column(df, top_n=10):
    """Display top N memory-consuming columns.

    Returns a DataFrame with one row per column (largest first) showing
    its memory in MB, dtype, unique-value count, and share of the total.
    """
    per_column = df.memory_usage(deep=True).sort_values(ascending=False)
    # The 'Index' entry describes the index, not a real column — drop it
    per_column = per_column.drop('Index', errors='ignore')
    top = per_column.head(top_n)

    summary = pd.DataFrame({
        'Column': top.index,
        'Memory_MB': top.values / 1024**2,
        'Dtype': [df[name].dtype for name in top.index],
        'Unique_Values': [df[name].nunique() for name in top.index],
    })
    total_mb = df.memory_usage(deep=True).sum() / 1024**2
    summary['Pct_of_Total'] = 100 * summary['Memory_MB'] / total_mb
    return summary
# Example usage: a string-heavy column should dominate the report
demo_columns = {
    'text_col': ['long string ' * 100 for _ in range(10000)],
    'int_col': range(10000),
    'cat_col': np.random.choice(['A', 'B', 'C'], 10000),
}
df_analysis = pd.DataFrame(demo_columns)
print(analyze_memory_by_column(df_analysis))
Monitoring Memory During Data Processing
Track memory usage throughout your data pipeline to identify bottlenecks:
import time
class MemoryMonitor:
    """Records a DataFrame's deep memory usage at labeled checkpoints."""

    def __init__(self, df):
        self.df = df
        self.checkpoints = []  # one dict per checkpoint() call

    def checkpoint(self, label):
        """Record memory usage at a specific point."""
        usage_mb = self.df.memory_usage(deep=True).sum() / 1024**2
        entry = {
            'label': label,
            'memory_mb': usage_mb,
            'timestamp': time.time(),
        }
        self.checkpoints.append(entry)

    def report(self):
        """Generate memory usage report."""
        summary = pd.DataFrame(self.checkpoints)
        # delta_mb only makes sense once there are two or more snapshots
        if len(summary) > 1:
            summary['delta_mb'] = summary['memory_mb'].diff()
        return summary
# Example usage: watch memory grow as derived columns are added
df_monitor = pd.DataFrame({'values': range(100000)})
monitor = MemoryMonitor(df_monitor)
monitor.checkpoint('Initial load')

for new_column, power, label in (
    ('squared', 2, 'After squared calculation'),
    ('cubed', 3, 'After cubed calculation'),
):
    df_monitor[new_column] = df_monitor['values'] ** power
    monitor.checkpoint(label)

print(monitor.report())
These techniques provide comprehensive control over DataFrame memory usage, enabling you to work with larger datasets efficiently and identify optimization opportunities in production environments.