Pandas - Get Shape of DataFrame (Rows and Columns)

• The `shape` attribute returns a tuple `(rows, columns)` representing DataFrame dimensions, accessible without parentheses since it's a property, not a method

Key Insights

• The `shape` attribute returns a tuple `(rows, columns)` representing DataFrame dimensions, accessible without parentheses since it's a property, not a method
• Use `len(df)` for row count only, `len(df.columns)` for column count only, and `df.size` for total element count (rows × columns)
• Shape checking is critical before operations like merging, pivoting, or memory-intensive transformations to prevent performance issues and validate data structure

Understanding DataFrame Shape Basics

The shape attribute provides the fundamental dimensions of any pandas DataFrame. It returns a tuple where the first element represents rows and the second represents columns.

import pandas as pd

# Sample inventory data: 4 rows x 3 columns.
df = pd.DataFrame({
    'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
    'price': [1200, 25, 75, 350],
    'quantity': [5, 50, 30, 10]
})

# shape is an attribute (property), not a method -- no parentheses.
print(df.shape)  # Output: (4, 3)
print(type(df.shape))  # Output: <class 'tuple'>

Access individual dimensions by indexing the tuple:

# Tuple unpacking: shape is always (rows, columns).
rows, columns = df.shape
print(f"Rows: {rows}, Columns: {columns}")  # Output: Rows: 4, Columns: 3

# Or access directly by index: 0 = rows, 1 = columns
print(f"Row count: {df.shape[0]}")  # Output: Row count: 4
print(f"Column count: {df.shape[1]}")  # Output: Column count: 3

Alternative Methods for Dimension Retrieval

While shape is the most direct approach, pandas offers several alternatives depending on your specific needs.

import pandas as pd

# Sample employee data: 5 rows x 4 columns.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 40, 45],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'department': ['IT', 'HR', 'IT', 'Finance', 'HR']
})

# Row count only -- len(df) is equivalent to df.shape[0]
row_count = len(df)
print(f"Rows using len(): {row_count}")  # Output: 5

# Column count only -- equivalent to df.shape[1]
col_count = len(df.columns)
print(f"Columns using len(df.columns): {col_count}")  # Output: 4

# Total elements -- rows multiplied by columns
total_elements = df.size
print(f"Total elements: {total_elements}")  # Output: 20

# Using index -- len(df.index) also yields the row count
index_length = len(df.index)
print(f"Rows using index: {index_length}")  # Output: 5

The size attribute calculates total elements by multiplying rows by columns:

# Invariant: size is always the product of the two shape components.
assert df.size == df.shape[0] * df.shape[1]  # True
print(f"{df.shape[0]} × {df.shape[1]} = {df.size}")  # Output: 5 × 4 = 20

Shape After Common Operations

Understanding how DataFrame operations affect shape helps prevent unexpected results and debug transformation pipelines.

import pandas as pd

df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'B', 'A', 'C'],
    'value': [10, 20, 30, 40, 50, 60],
    'metric': [1.5, 2.5, 3.5, 4.5, 5.5, 6.5]
})

print(f"Original shape: {df.shape}")  # Output: (6, 3)

# Filtering reduces rows: values 30, 40, 50, 60 pass the condition
filtered = df[df['value'] > 25]
print(f"After filtering: {filtered.shape}")  # Output: (4, 3)

# Selecting columns reduces columns; the row count is untouched
selected = df[['category', 'value']]
print(f"After column selection: {selected.shape}")  # Output: (6, 2)

# Groupby aggregation changes both dimensions: one row per group
# (A, B, C) and one column per aggregated field
grouped = df.groupby('category').agg({'value': 'sum', 'metric': 'mean'})
print(f"After groupby: {grouped.shape}")  # Output: (3, 2)

# Adding columns increases column count (modifies df in place)
df['new_column'] = df['value'] * 2
print(f"After adding column: {df.shape}")  # Output: (6, 4)

# Dropping duplicates may reduce rows
df_with_dupes = pd.DataFrame({
    'A': [1, 1, 2, 2, 3],
    'B': [4, 4, 5, 6, 7]
})
print(f"With duplicates: {df_with_dupes.shape}")  # Output: (5, 2)
# Row (1, 4) appears twice; one copy is removed
deduped = df_with_dupes.drop_duplicates()
print(f"After deduplication: {deduped.shape}")  # Output: (4, 2)

Validation and Conditional Logic

Shape checking enables robust data validation before processing operations.

import pandas as pd

def validate_dataframe(df, min_rows=1, expected_columns=None):
    """Validate DataFrame dimensions before processing.

    Args:
        df: The DataFrame to check.
        min_rows: Minimum acceptable row count (default 1).
        expected_columns: Exact column count required, or None to skip
            the column check.

    Returns:
        True when both checks pass.

    Raises:
        ValueError: If df has fewer than min_rows rows, or a column
            count different from expected_columns.
    """
    rows, cols = df.shape

    if rows < min_rows:
        raise ValueError(f"Insufficient data: {rows} rows, need at least {min_rows}")

    # Compare against None explicitly: a plain truthiness test would
    # silently skip the check when expected_columns == 0.
    if expected_columns is not None and cols != expected_columns:
        raise ValueError(f"Column mismatch: got {cols}, expected {expected_columns}")

    return True

# Example usage: 3 rows x 3 columns
df = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['X', 'Y', 'Z'],
    'value': [100, 200, 300]
})

# 3 rows >= min_rows and exactly 3 columns, so validation succeeds
try:
    validate_dataframe(df, min_rows=2, expected_columns=3)
    print("Validation passed")
except ValueError as e:
    print(f"Validation failed: {e}")

Conditional processing based on DataFrame size:

import pandas as pd

df = pd.DataFrame({'data': range(1000)})

# Pick a processing strategy from the row count, smallest tier first.
n_rows = df.shape[0]
if n_rows <= 100:
    # Direct processing for tiny frames
    print(f"Small dataset ({df.shape[0]} rows), processing directly")
elif n_rows <= 10000:
    # Standard processing for mid-sized frames
    print(f"Medium dataset ({df.shape[0]} rows), using standard processing")
else:
    # Chunked processing for anything larger
    chunk_size = 1000
    print(f"Large dataset detected ({df.shape[0]} rows), using chunked processing")

Memory Estimation Using Shape

Calculate memory requirements before loading or transforming large datasets.

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'int_col': np.random.randint(0, 100, 10000),
    'float_col': np.random.random(10000),
    'str_col': ['text'] * 10000
})

rows, cols = df.shape

# Per-column footprint; deep=True also measures the string contents.
# Computed once and reused below instead of recalculating per line.
usage = df.memory_usage(deep=True)
print(usage)

# Extrapolate: average bytes per row, scaled up to a target row count.
estimated_rows = 1_000_000
total_bytes = usage.sum()
memory_per_row = total_bytes / rows
estimated_memory_mb = (estimated_rows * memory_per_row) / (1024 ** 2)

print(f"\nCurrent shape: {df.shape}")
print(f"Current memory: {total_bytes / (1024**2):.2f} MB")
print(f"Estimated memory for {estimated_rows:,} rows: {estimated_memory_mb:.2f} MB")

Shape in Merge and Join Operations

Verify shape changes during merge operations to catch unexpected duplications or data loss.

import pandas as pd

df1 = pd.DataFrame({
    'key': ['A', 'B', 'C'],
    'value1': [1, 2, 3]
})

df2 = pd.DataFrame({
    'key': ['A', 'B', 'B', 'D'],
    'value2': [10, 20, 25, 30]
})

print(f"df1 shape: {df1.shape}")  # Output: (3, 2)
print(f"df2 shape: {df2.shape}")  # Output: (4, 2)

# Inner join keeps only keys present in both frames; the duplicated
# 'B' in df2 fans out into two result rows.
inner = df1.merge(df2, on='key', how='inner')
print(f"Inner join shape: {inner.shape}")  # Output: (3, 3) - B appears twice

# Left join keeps every df1 key; 'C' gets NaN for value2.
left = df1.merge(df2, on='key', how='left')
print(f"Left join shape: {left.shape}")  # Output: (4, 3)

# Outer join keeps keys from both sides.
outer = df1.merge(df2, on='key', how='outer')
print(f"Outer join shape: {outer.shape}")  # Output: (5, 3)

# Validate merge results against the expected row count.
expected_rows = 3
if inner.shape[0] != expected_rows:
    print(f"Warning: Merge produced {inner.shape[0]} rows, expected {expected_rows}")

Empty and Single-Dimension DataFrames

Handle edge cases where DataFrames have zero rows or columns.

import pandas as pd

# Empty DataFrame: zero rows AND zero columns
empty_df = pd.DataFrame()
print(f"Empty DataFrame shape: {empty_df.shape}")  # Output: (0, 0)
print(f"Is empty: {empty_df.empty}")  # Output: True

# DataFrame with columns but no rows -- still counts as empty
df_no_rows = pd.DataFrame(columns=['A', 'B', 'C'])
print(f"No rows shape: {df_no_rows.shape}")  # Output: (0, 3)
print(f"Is empty: {df_no_rows.empty}")  # Output: True

# Single column DataFrame
single_col = pd.DataFrame({'A': [1, 2, 3]})
print(f"Single column shape: {single_col.shape}")  # Output: (3, 1)

# Single row DataFrame
single_row = pd.DataFrame({'A': [1], 'B': [2], 'C': [3]})
print(f"Single row shape: {single_row.shape}")  # Output: (1, 3)

# Safe processing with shape checks
def process_dataframe(df):
    """Sum each column, refusing frames that have no rows or no columns."""
    n_rows, n_cols = df.shape
    if n_rows == 0:
        print("Cannot process: DataFrame has no rows")
        return None
    if n_cols == 0:
        print("Cannot process: DataFrame has no columns")
        return None
    return df.sum()

print(process_dataframe(empty_df))  # Output: Cannot process: DataFrame has no rows

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.