Pandas - Count NaN/NULL Values in Column

• Pandas provides multiple methods to count NaN values including `isna()`, `isnull()`, and `value_counts(dropna=False)`, each suited for different use cases and performance requirements.

Key Insights

• Pandas provides multiple methods to count NaN values including isna(), isnull(), and value_counts(dropna=False), each suited for different use cases and performance requirements.

• Understanding the distinction between NaN, None, and pd.NA is critical when working with missing data across different dtype contexts, including nullable integer and string types.

• Combining NaN counting techniques with groupby operations and visualization enables comprehensive missing-data analysis across entire datasets.

Basic NaN Counting Methods

The most straightforward approach to count NaN values in a Pandas column uses the isna() method combined with sum(). Since isna() returns a boolean Series where True indicates missing values, summing converts True to 1 and False to 0.

import pandas as pd
import numpy as np

# Build a small product table with deliberately missing prices and quantities
products = ['A', 'B', 'C', 'D', 'E', 'F']
prices = [10.5, np.nan, 15.0, np.nan, 12.5, np.nan]
quantities = [100, 200, np.nan, 150, np.nan, 175]
df = pd.DataFrame({'product': products, 'price': prices, 'quantity': quantities})

# isna() marks each missing entry as True; summing the booleans counts them
nan_count = df['price'].isna().sum()
print(f"NaN count in price: {nan_count}")  # Output: 3

# isnull() is an alias of isna() — identical result
nan_count_alt = df['price'].isnull().sum()
print(f"NaN count (isnull): {nan_count_alt}")  # Output: 3

To get both counts and percentages simultaneously:

def nan_summary(series):
    """Summarize missing values in a Series.

    Parameters:
    - series: pandas Series to inspect.

    Returns a dict with the total length, NaN count, valid (non-NaN) count,
    and the NaN percentage formatted as a string like '50.00%'.
    """
    total = len(series)
    # Cast to a plain int so printing the dict shows `3`, not `np.int64(3)`
    # (numpy >= 2.0 changed the repr of numpy scalars).
    nan_count = int(series.isna().sum())
    # Guard against ZeroDivisionError for an empty series.
    nan_percentage = (nan_count / total) * 100 if total else 0.0
    return {
        'total': total,
        'nan_count': nan_count,
        'valid_count': total - nan_count,
        'nan_percentage': f"{nan_percentage:.2f}%"
    }

# Summarize the 'price' column of the sample DataFrame built earlier.
print(nan_summary(df['price']))
# Output: {'total': 6, 'nan_count': 3, 'valid_count': 3, 'nan_percentage': '50.00%'}

Counting Across Multiple Columns

When analyzing missing data patterns across an entire DataFrame, apply aggregation methods that operate on all columns simultaneously.

# Count NaN values for all columns (isnull() is an alias of isna())
nan_counts = df.isnull().sum()
print(nan_counts)
# Output:
# product     0
# price       3
# quantity    2
# dtype: int64

# Get percentage of NaN values per column
# (the mean of the boolean mask is the NaN fraction; scale to percent)
nan_percentages = df.isna().mean() * 100
print(nan_percentages)
# Output:
# product      0.0
# price       50.0
# quantity    33.333333
# dtype: float64

# Create comprehensive summary from a single boolean mask
missing_mask = df.isna()
summary = pd.DataFrame({
    'total_rows': len(df),
    'nan_count': missing_mask.sum(),
    'valid_count': (~missing_mask).sum(),
    'nan_percentage': (missing_mask.mean() * 100).round(2)
})
print(summary)

For row-wise analysis to identify records with missing data:

# Count NaN values per row (axis=1 sums across columns)
nan_per_row = df.isnull().sum(axis=1)
print(nan_per_row)
# Output:
# 0    0
# 1    1
# 2    1
# 3    1
# 4    1
# 5    1
# dtype: int64

# Filter rows with any NaN values
row_has_nan = df.isna().any(axis=1)
rows_with_nan = df[row_has_nan]
print(f"Rows containing NaN: {len(rows_with_nan)}")  # Output: 5

Using value_counts() for Detailed Analysis

The value_counts() method with dropna=False provides a frequency distribution that includes NaN values, useful for categorical analysis.

# Sample categorical data with missing entries in both columns
statuses = ['active', 'inactive', np.nan, 'active', np.nan, 'pending', 'active']
regions = ['North', 'South', 'East', np.nan, 'North', np.nan, 'West']
df_categorical = pd.DataFrame({'status': statuses, 'region': regions})

# dropna=False keeps NaN as its own row in the frequency table
status_counts = df_categorical['status'].value_counts(dropna=False)
print(status_counts)
# Output:
# active      3
# NaN         2
# inactive    1
# pending     1
# Name: status, dtype: int64

# .get() returns the fallback (0) when no NaN entry exists in the index
nan_count = status_counts.get(np.nan, 0)
print(f"NaN count: {nan_count}")  # Output: 2

Handling Different Missing Value Types

Pandas uses different representations for missing data depending on the dtype. Understanding these distinctions prevents counting errors.

# Traditional float column: None is coerced to np.nan on construction,
# so both missing markers are counted here
float_col = pd.Series([1.0, np.nan, 3.0, None])
print(f"Float NaN count: {pd.isna(float_col).sum()}")  # Output: 2

# Nullable integer dtype uses pd.NA as its missing marker
int_col = pd.Series([1, 2, pd.NA, 4], dtype='Int64')
print(f"Int64 NaN count: {pd.isna(int_col).sum()}")  # Output: 1

# Dedicated string dtype also uses pd.NA
str_col = pd.Series(['a', 'b', pd.NA, 'd'], dtype='string')
print(f"String NaN count: {pd.isna(str_col).sum()}")  # Output: 1

# Mixed object column: None, np.nan, and pd.NA are all detected as missing
obj_col = pd.Series([1, 'text', None, np.nan, pd.NA])
print(f"Object NaN count: {pd.isna(obj_col).sum()}")  # Output: 3

For datasets with specific placeholder values representing missing data:

# Sensor-style data that uses sentinel values instead of real NaN
df_placeholders = pd.DataFrame({
    'temperature': [25.0, -999, 30.0, -999, 28.0],
    'humidity': [60, 0, 65, 70, 0]
})

# Map each column's sentinel to NaN; replace() accepts a per-column dict
sentinel_map = {
    'temperature': {-999: np.nan},
    'humidity': {0: np.nan}
}
df_clean = df_placeholders.replace(sentinel_map)

print(df_clean.isna().sum())
# Output:
# temperature    2
# humidity       2
# dtype: int64

GroupBy Operations with NaN Counts

Analyzing missing data patterns across categorical groups reveals data quality issues in specific segments.

df_grouped = pd.DataFrame({
    'category': ['A', 'A', 'B', 'B', 'C', 'C', 'A', 'B'],
    'value1': [10, np.nan, 20, 30, np.nan, 40, 50, np.nan],
    'value2': [100, 200, np.nan, 300, 400, np.nan, 500, 600]
})

# Count NaN values by group.
# Grouping the boolean isna() mask by the category values — rather than
# groupby('category').apply(lambda x: x.isna().sum()) — avoids the
# pandas >= 2.2 deprecation of applying functions to the grouping
# columns, and is fully vectorized.
nan_by_group = df_grouped.isna().groupby(df_grouped['category']).sum()
print(nan_by_group)
# Output:
#           category  value1  value2
# category
# A                0       1       0
# B                0       1       1
# C                0       1       1

# Calculate NaN percentage by group
def group_nan_percentage(group):
    """Return the percentage of NaN values per column, rounded to 2 decimals."""
    return (group.isna().sum() / len(group) * 100).round(2)

# Select the value columns explicitly so apply() never touches the
# grouping column (the source of the deprecation warning noted above).
nan_pct_by_group = (
    df_grouped.groupby('category')[['value1', 'value2']].apply(group_nan_percentage)
)
print(nan_pct_by_group)

Performance Considerations for Large Datasets

When working with large DataFrames, choose the most efficient method for your use case.

import time

# Create large DataFrame (~1M rows) so timing differences are visible
large_df = pd.DataFrame({
    'col1': np.random.choice([1, 2, np.nan], size=1_000_000),
    'col2': np.random.choice([10, 20, np.nan], size=1_000_000)
})

# time.perf_counter() is a monotonic, high-resolution clock. Unlike
# time.time(), it is unaffected by system clock adjustments, which makes
# it the correct choice for measuring elapsed time in benchmarks.

# Method 1: Series.isna().sum()
start = time.perf_counter()
count1 = large_df['col1'].isna().sum()
time1 = time.perf_counter() - start

# Method 2: module-level pd.isna() — same result, different entry point
start = time.perf_counter()
count2 = pd.isna(large_df['col1']).sum()
time2 = time.perf_counter() - start

print(f"Method 1 time: {time1:.4f}s, Method 2 time: {time2:.4f}s")

# For multiple columns, one vectorized pass beats per-column calls
start = time.perf_counter()
all_counts = large_df.isna().sum()
time_vectorized = time.perf_counter() - start
print(f"Vectorized time for all columns: {time_vectorized:.4f}s")

Practical Missing Data Report

Combine these techniques into a reusable function for comprehensive missing data analysis:

def missing_data_report(df, threshold=50):
    """
    Generate comprehensive missing data report.

    Parameters:
    - df: DataFrame to analyze
    - threshold: Flag columns with NaN percentage above this value

    Returns the per-column report DataFrame, sorted with the most-missing
    columns first.
    """
    n_rows = len(df)
    null_mask = df.isna()

    # One row per column: dtype, counts, and percentage missing
    report = pd.DataFrame({
        'dtype': df.dtypes,
        'non_null': (~null_mask).sum(),
        'null_count': null_mask.sum(),
        'null_pct': (null_mask.sum() / n_rows * 100).round(2)
    })

    report['flag'] = report['null_pct'] > threshold
    report.sort_values('null_pct', ascending=False, inplace=True)

    print(f"Dataset Shape: {df.shape}")
    print(f"Total Rows: {n_rows}")
    print(f"\nColumns with >{threshold}% missing data:")
    print(report[report['flag']])

    return report

# Usage
# Build a 100-row test frame with roughly 20% missing scores and
# roughly 40% missing grades (per the probability vectors below).
df_test = pd.DataFrame({
    'id': range(100),
    'score': np.random.choice([1, 2, 3, np.nan], 100, p=[0.3, 0.3, 0.2, 0.2]),
    'grade': np.random.choice(['A', 'B', np.nan], 100, p=[0.4, 0.2, 0.4])
})

# Flag any column with more than 30% missing values.
report = missing_data_report(df_test, threshold=30)

This approach provides immediate insight into data quality issues and guides data cleaning strategies based on the extent and distribution of missing values across your dataset.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.