Pandas - Count NaN/NULL Values in Column
• Counting missing values is a core step in assessing data quality. This guide walks through the main techniques, from simple single-column counts to group-level analysis and performance comparisons on large datasets.
Key Insights
• Pandas provides multiple methods to count NaN values including isna(), isnull(), and value_counts(dropna=False), each suited for different use cases and performance requirements.
• Understanding the distinction between NaN, None, and pd.NA is critical when working with missing data across different dtype contexts including nullable integer and string types.
• Combining NaN counting techniques with groupby operations and visualization enables comprehensive missing data analysis across entire datasets.
Basic NaN Counting Methods
The most straightforward approach to count NaN values in a Pandas column uses the isna() method combined with sum(). Since isna() returns a boolean Series where True indicates missing values, summing converts True to 1 and False to 0.
import pandas as pd
import numpy as np
# Create sample DataFrame with missing values seeded into the numeric
# columns so the counting methods below have something to find.
df = pd.DataFrame({
'product': ['A', 'B', 'C', 'D', 'E', 'F'],
'price': [10.5, np.nan, 15.0, np.nan, 12.5, np.nan],
'quantity': [100, 200, np.nan, 150, np.nan, 175]
})
# Count NaN values in a single column: isna() returns a boolean Series,
# and sum() counts the True entries (True == 1, False == 0).
nan_count = df['price'].isna().sum()
print(f"NaN count in price: {nan_count}") # Output: 3
# Alternative using isnull() - functionally identical (isnull is an alias of isna)
nan_count_alt = df['price'].isnull().sum()
print(f"NaN count (isnull): {nan_count_alt}") # Output: 3
To get both counts and percentages simultaneously:
def nan_summary(series):
    """Summarize missing values in a Series.

    Parameters
    ----------
    series : pd.Series
        Column to analyze.

    Returns
    -------
    dict
        Keys: 'total' (row count), 'nan_count' (missing values),
        'valid_count' (non-missing values), and 'nan_percentage'
        formatted as a string such as '50.00%'.
    """
    total = len(series)
    # Cast to a plain int so the dict prints as {'nan_count': 3} rather
    # than exposing a NumPy scalar repr (np.int64(3) under NumPy 2).
    nan_count = int(series.isna().sum())
    # Guard against ZeroDivisionError for an empty Series: with no rows
    # there are no missing values, so 0% is the correct answer.
    nan_percentage = (nan_count / total) * 100 if total else 0.0
    return {
        'total': total,
        'nan_count': nan_count,
        'valid_count': total - nan_count,
        'nan_percentage': f"{nan_percentage:.2f}%"
    }
# Summarize the price column: 3 NaN out of 6 rows -> 50% missing.
print(nan_summary(df['price']))
# Output: {'total': 6, 'nan_count': 3, 'valid_count': 3, 'nan_percentage': '50.00%'}
Counting Across Multiple Columns
When analyzing missing data patterns across an entire DataFrame, apply aggregation methods that operate on all columns simultaneously.
# Count NaN values for all columns; sum() defaults to axis=0, so the
# boolean frame returned by isna() collapses to one count per column.
nan_counts = df.isna().sum()
print(nan_counts)
# Output:
# product 0
# price 3
# quantity 2
# dtype: int64
# Get percentage of NaN values per column
nan_percentages = (df.isna().sum() / len(df)) * 100
print(nan_percentages)
# Output:
# product 0.0
# price 50.0
# quantity 33.333333
# dtype: float64
# Create comprehensive summary; the scalar len(df) broadcasts against
# the column-indexed Series produced by the sum() calls.
summary = pd.DataFrame({
'total_rows': len(df),
'nan_count': df.isna().sum(),
'valid_count': df.notna().sum(),
'nan_percentage': (df.isna().sum() / len(df) * 100).round(2)
})
print(summary)
For row-wise analysis to identify records with missing data:
# Count NaN values per row; axis=1 sums across columns instead of down them.
nan_per_row = df.isna().sum(axis=1)
print(nan_per_row)
# Output:
# 0 0
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
# dtype: int64
# Filter rows with any NaN values; any(axis=1) marks a row True if at
# least one of its columns is missing, giving a boolean row mask.
rows_with_nan = df[df.isna().any(axis=1)]
print(f"Rows containing NaN: {len(rows_with_nan)}") # Output: 5
Using value_counts() for Detailed Analysis
The value_counts() method with dropna=False provides a frequency distribution that includes NaN values, useful for categorical analysis.
df_categorical = pd.DataFrame({
'status': ['active', 'inactive', np.nan, 'active', np.nan, 'pending', 'active'],
'region': ['North', 'South', 'East', np.nan, 'North', np.nan, 'West']
})
# Count including NaN values; dropna=False keeps NaN as its own row in
# the frequency table (value_counts drops it by default).
status_counts = df_categorical['status'].value_counts(dropna=False)
print(status_counts)
# Output:
# active 3
# NaN 2
# inactive 1
# pending 1
# Name: status, dtype: int64
# Extract just the NaN count; .get(np.nan) works because the counts
# index retains the NaN label, and the default 0 covers Series that
# contain no missing values at all.
nan_count = status_counts.get(np.nan, 0)
print(f"NaN count: {nan_count}") # Output: 2
Handling Different Missing Value Types
Pandas uses different representations for missing data depending on the dtype. Understanding these distinctions prevents counting errors.
# Traditional float column with np.nan; the Python None is coerced to
# NaN on construction of a float Series, so both entries count as missing.
float_col = pd.Series([1.0, np.nan, 3.0, None])
print(f"Float NaN count: {float_col.isna().sum()}") # Output: 2
# Nullable integer dtype with pd.NA (capital-I 'Int64', not 'int64')
int_col = pd.Series([1, 2, pd.NA, 4], dtype='Int64')
print(f"Int64 NaN count: {int_col.isna().sum()}") # Output: 1
# String dtype with pd.NA
str_col = pd.Series(['a', 'b', pd.NA, 'd'], dtype='string')
print(f"String NaN count: {str_col.isna().sum()}") # Output: 1
# Mixed object column; isna() recognizes None, np.nan and pd.NA alike,
# so all three missing markers are counted.
obj_col = pd.Series([1, 'text', None, np.nan, pd.NA])
print(f"Object NaN count: {obj_col.isna().sum()}") # Output: 3
For datasets with specific placeholder values representing missing data:
df_placeholders = pd.DataFrame({
'temperature': [25.0, -999, 30.0, -999, 28.0],
'humidity': [60, 0, 65, 70, 0]
})
# Replace placeholders with NaN; note that introducing NaN into the
# integer humidity column upcasts it to float, since NaN is a float value.
df_clean = df_placeholders.replace({
'temperature': {-999: np.nan},
'humidity': {0: np.nan}
})
print(df_clean.isna().sum())
# Output:
# temperature 2
# humidity 2
# dtype: int64
GroupBy Operations with NaN Counts
Analyzing missing data patterns across categorical groups reveals data quality issues in specific segments.
df_grouped = pd.DataFrame({
    'category': ['A', 'A', 'B', 'B', 'C', 'C', 'A', 'B'],
    'value1': [10, np.nan, 20, 30, np.nan, 40, 50, np.nan],
    'value2': [100, 200, np.nan, 300, 400, np.nan, 500, 600]
})
# Count NaN values by group. Use agg rather than apply: apply passes the
# whole sub-frame including the grouping column (deprecated behavior in
# pandas >= 2.2, removed in 3.0), while agg runs the function once per
# non-grouping column per group and keeps the result tidy.
nan_by_group = df_grouped.groupby('category').agg(
    lambda s: s.isna().sum()
)
print(nan_by_group)
# Output:
#           value1  value2
# category
# A              1       0
# B              1       1
# C              1       1
# Calculate NaN percentage by group
def group_nan_percentage(group):
    """Return the percentage of NaN values in group, rounded to 2 decimals."""
    return (group.isna().sum() / len(group) * 100).round(2)
nan_pct_by_group = df_grouped.groupby('category').agg(group_nan_percentage)
print(nan_pct_by_group)
Performance Considerations for Large Datasets
When working with large DataFrames, choose the most efficient method for your use case.
import time
# Create large DataFrame (~1M rows) for timing comparisons.
large_df = pd.DataFrame({
    'col1': np.random.choice([1, 2, np.nan], size=1_000_000),
    'col2': np.random.choice([10, 20, np.nan], size=1_000_000)
})
# Method 1: isna().sum()
# Use time.perf_counter() instead of time.time(): it is monotonic and
# high-resolution, so short benchmark intervals are measured reliably
# (time.time() can jump with clock adjustments and has coarse resolution).
start = time.perf_counter()
count1 = large_df['col1'].isna().sum()
time1 = time.perf_counter() - start
# Method 2: pd.isna().sum()
start = time.perf_counter()
count2 = pd.isna(large_df['col1']).sum()
time2 = time.perf_counter() - start
print(f"Method 1 time: {time1:.4f}s, Method 2 time: {time2:.4f}s")
# For multiple columns, use vectorized operations
start = time.perf_counter()
all_counts = large_df.isna().sum()
time_vectorized = time.perf_counter() - start
print(f"Vectorized time for all columns: {time_vectorized:.4f}s")
Practical Missing Data Report
Combine these techniques into a reusable function for comprehensive missing data analysis:
def missing_data_report(df, threshold=50):
    """
    Generate a comprehensive missing data report for a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to analyze.
    threshold : float, default 50
        Flag columns whose NaN percentage exceeds this value.

    Returns
    -------
    pd.DataFrame
        Indexed by column name, with the column dtype, non-null and
        null counts, null percentage, and a boolean 'flag' column,
        sorted by 'null_pct' in descending order.
    """
    total_rows = len(df)
    # Avoid dividing by zero on an empty DataFrame: an empty frame has
    # no missing values, so 0% everywhere is the correct report.
    denominator = total_rows if total_rows else 1
    report = pd.DataFrame({
        'dtype': df.dtypes,
        'non_null': df.notna().sum(),
        'null_count': df.isna().sum(),
        'null_pct': (df.isna().sum() / denominator * 100).round(2)
    })
    report['flag'] = report['null_pct'] > threshold
    report = report.sort_values('null_pct', ascending=False)
    print(f"Dataset Shape: {df.shape}")
    print(f"Total Rows: {total_rows}")
    print(f"\nColumns with >{threshold}% missing data:")
    print(report[report['flag']])
    return report
# Usage
df_test = pd.DataFrame({
    'id': range(100),
    'score': np.random.choice([1, 2, 3, np.nan], 100, p=[0.3, 0.3, 0.2, 0.2]),
    # Use None rather than np.nan among string choices: np.random.choice
    # coerces a mixed str/float list to a unicode array, which turns
    # np.nan into the literal string 'nan' — a value isna() would NOT
    # count as missing. A list containing None stays object dtype, so
    # the sampled None entries are genuine missing values.
    'grade': np.random.choice(['A', 'B', None], 100, p=[0.4, 0.2, 0.4])
})
report = missing_data_report(df_test, threshold=30)
This approach provides immediate insight into data quality issues and guides data cleaning strategies based on the extent and distribution of missing values across your dataset.