Pandas - Get Shape of DataFrame (Rows and Columns)
A practical guide to reading DataFrame dimensions with the `shape` attribute, along with related tools (`len`, `size`) and the situations where checking shape matters.
Key Insights
• The shape attribute returns a tuple (rows, columns) representing DataFrame dimensions, accessible without parentheses since it’s a property, not a method
• Use len(df) for row count only, len(df.columns) for column count only, and df.size for total element count (rows × columns)
• Shape checking is critical before operations like merging, pivoting, or memory-intensive transformations to prevent performance issues and validate data structure
Understanding DataFrame Shape Basics
The shape attribute provides the fundamental dimensions of any pandas DataFrame. It returns a tuple where the first element represents rows and the second represents columns.
import pandas as pd

# A small inventory table: four records, three fields -> shape (4, 3).
inventory = {
    'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
    'price': [1200, 25, 75, 350],
    'quantity': [5, 50, 30, 10],
}
df = pd.DataFrame(inventory)

print(df.shape)  # Output: (4, 3)
print(type(df.shape))  # Output: <class 'tuple'>
Access individual dimensions by indexing the tuple:
# Tuple unpacking assigns both dimensions in one statement.
rows, columns = df.shape
print(f"Rows: {rows}, Columns: {columns}")  # Output: Rows: 4, Columns: 3
# Or access directly
# Index 0 is always the row count, index 1 the column count.
print(f"Row count: {df.shape[0]}")  # Output: Row count: 4
print(f"Column count: {df.shape[1]}")  # Output: Column count: 3
Alternative Methods for Dimension Retrieval
While shape is the most direct approach, pandas offers several alternatives depending on your specific needs.
import pandas as pd

# Five employees across four attributes -> shape (5, 4).
staff = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 45],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'department': ['IT', 'HR', 'IT', 'Finance', 'HR'],
}
df = pd.DataFrame(staff)

# Row count only
print(f"Rows using len(): {len(df)}")  # Output: 5
# Column count only
print(f"Columns using len(df.columns): {len(df.columns)}")  # Output: 4
# Total elements
print(f"Total elements: {df.size}")  # Output: 20
# Using index
print(f"Rows using index: {len(df.index)}")  # Output: 5
The size attribute calculates total elements by multiplying rows by columns:
# size is always the product of the two shape dimensions.
assert df.size == df.shape[0] * df.shape[1]  # True
print(f"{df.shape[0]} × {df.shape[1]} = {df.size}")  # Output: 5 × 4 = 20
Shape After Common Operations
Understanding how DataFrame operations affect shape helps prevent unexpected results and debug transformation pipelines.
import pandas as pd

df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'B', 'A', 'C'],
    'value': [10, 20, 30, 40, 50, 60],
    'metric': [1.5, 2.5, 3.5, 4.5, 5.5, 6.5]
})
print(f"Original shape: {df.shape}")  # Output: (6, 3)

# A boolean mask keeps only the matching rows; columns are untouched.
mask = df['value'] > 25
filtered = df.loc[mask]
print(f"After filtering: {filtered.shape}")  # Output: (4, 3)

# Column selection keeps every row but narrows the columns.
selected = df.loc[:, ['category', 'value']]
print(f"After column selection: {selected.shape}")  # Output: (6, 2)

# Aggregation collapses to one row per group; columns become the aggregates.
grouped = df.groupby('category').agg({'value': 'sum', 'metric': 'mean'})
print(f"After groupby: {grouped.shape}")  # Output: (3, 2)

# Assigning a new column widens the frame in place.
df['new_column'] = df['value'] * 2
print(f"After adding column: {df.shape}")  # Output: (6, 4)

# drop_duplicates removes rows that repeat in every column.
df_with_dupes = pd.DataFrame({
    'A': [1, 1, 2, 2, 3],
    'B': [4, 4, 5, 6, 7]
})
print(f"With duplicates: {df_with_dupes.shape}")  # Output: (5, 2)
deduped = df_with_dupes.drop_duplicates()
print(f"After deduplication: {deduped.shape}")  # Output: (4, 2)
Validation and Conditional Logic
Shape checking enables robust data validation before processing operations.
import pandas as pd
def validate_dataframe(df, min_rows=1, expected_columns=None):
    """Validate DataFrame dimensions before processing.

    Args:
        df: DataFrame whose shape is checked.
        min_rows: Minimum acceptable row count (inclusive).
        expected_columns: Exact column count required, or None to skip
            the column check.

    Returns:
        True when both checks pass.

    Raises:
        ValueError: If the frame has fewer than ``min_rows`` rows, or if
            ``expected_columns`` is given and the column count differs.
    """
    rows, cols = df.shape
    if rows < min_rows:
        raise ValueError(f"Insufficient data: {rows} rows, need at least {min_rows}")
    # Compare against None explicitly: the previous truthiness test
    # (`if expected_columns and ...`) silently skipped validation when
    # expected_columns=0 was requested.
    if expected_columns is not None and cols != expected_columns:
        raise ValueError(f"Column mismatch: got {cols}, expected {expected_columns}")
    return True
# Example usage
df = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['X', 'Y', 'Z'],
    'value': [100, 200, 300]
})
try:
    # 3 rows satisfies min_rows=2 and the frame has exactly 3 columns,
    # so no ValueError is raised here.
    validate_dataframe(df, min_rows=2, expected_columns=3)
    print("Validation passed")
except ValueError as e:
    print(f"Validation failed: {e}")
Conditional processing based on DataFrame size:
import pandas as pd

df = pd.DataFrame({'data': range(1000)})

# Pick a processing strategy from the row count.
n_rows = df.shape[0]
if n_rows > 10000:
    # Chunked processing keeps memory bounded on large inputs.
    chunk_size = 1000
    print(f"Large dataset detected ({n_rows} rows), using chunked processing")
elif n_rows > 100:
    print(f"Medium dataset ({n_rows} rows), using standard processing")
else:
    print(f"Small dataset ({n_rows} rows), processing directly")
Memory Estimation Using Shape
Calculate memory requirements before loading or transforming large datasets.
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'int_col': np.random.randint(0, 100, 10000),
    'float_col': np.random.random(10000),
    'str_col': ['text'] * 10000
})
rows, cols = df.shape
# Memory usage per column
# deep=True also measures the Python string objects in the object-dtype
# column, not just the array of pointers to them.
print(df.memory_usage(deep=True))
# Estimate memory for scaling
# Per-row cost from the 10k-row sample is extrapolated linearly;
# assumes the 1M-row data has similar string lengths — rough estimate only.
estimated_rows = 1_000_000
memory_per_row = df.memory_usage(deep=True).sum() / rows
estimated_memory_mb = (estimated_rows * memory_per_row) / (1024 ** 2)
print(f"\nCurrent shape: {df.shape}")
print(f"Current memory: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
print(f"Estimated memory for {estimated_rows:,} rows: {estimated_memory_mb:.2f} MB")
Shape in Merge and Join Operations
Verify shape changes during merge operations to catch unexpected duplications or data loss.
import pandas as pd

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'B', 'D'], 'value2': [10, 20, 25, 30]})
print(f"df1 shape: {df1.shape}")  # Output: (3, 2)
print(f"df2 shape: {df2.shape}")  # Output: (4, 2)

# Inner join
# Only keys present in both frames survive; the duplicated 'B' in df2
# fans out to two result rows.
inner = df1.merge(df2, on='key', how='inner')
print(f"Inner join shape: {inner.shape}")  # Output: (3, 3) - B appears twice
# Left join
inner_how = 'left'
left = df1.merge(df2, on='key', how=inner_how)
print(f"Left join shape: {left.shape}")  # Output: (4, 3)
# Outer join
outer = df1.merge(df2, on='key', how='outer')
print(f"Outer join shape: {outer.shape}")  # Output: (5, 3)

# Validate merge results
# A row count different from the expectation signals key duplication or loss.
expected_rows = 3
if inner.shape[0] != expected_rows:
    print(f"Warning: Merge produced {inner.shape[0]} rows, expected {expected_rows}")
Empty and Single-Dimension DataFrames
Handle edge cases where DataFrames have zero rows or columns.
import pandas as pd

# A DataFrame constructed with no data has zero rows and zero columns.
empty_df = pd.DataFrame()
print(f"Empty DataFrame shape: {empty_df.shape}")  # Output: (0, 0)
print(f"Is empty: {empty_df.empty}")  # Output: True

# Columns can exist with no rows at all; empty is still True.
df_no_rows = pd.DataFrame(columns=['A', 'B', 'C'])
print(f"No rows shape: {df_no_rows.shape}")  # Output: (0, 3)
print(f"Is empty: {df_no_rows.empty}")  # Output: True

# Three rows, one column.
single_col = pd.DataFrame({'A': [1, 2, 3]})
print(f"Single column shape: {single_col.shape}")  # Output: (3, 1)

# One row built from a single record.
single_row = pd.DataFrame([{'A': 1, 'B': 2, 'C': 3}])
print(f"Single row shape: {single_row.shape}")  # Output: (1, 3)
# Safe processing with shape checks
def process_dataframe(df):
    """Return df.sum(), or None (with a message) if either axis is empty."""
    n_rows, n_cols = df.shape
    if not n_rows:
        print("Cannot process: DataFrame has no rows")
        return None
    if not n_cols:
        print("Cannot process: DataFrame has no columns")
        return None
    return df.sum()
print(process_dataframe(empty_df))  # Prints "Cannot process: DataFrame has no rows", then "None" (the returned value)