Pandas - Count NaN/NULL Values in Column
• Counting missing values is a core step in assessing data quality. This guide walks through the main techniques, from simple single-column counts to group-level analysis and performance comparisons on large datasets.
Key Insights
• Pandas provides multiple methods to count NaN values including isna(), isnull(), and value_counts(dropna=False), each suited for different use cases and performance requirements.
• Understanding the distinction between NaN, None, and pd.NA is critical when working with missing data across different dtype contexts including nullable integer and string types.
• Combining NaN counting techniques with groupby operations and visualization enables comprehensive missing data analysis across entire datasets.
Basic NaN Counting Methods
The most straightforward approach to count NaN values in a Pandas column uses the isna() method combined with sum(). Since isna() returns a boolean Series where True indicates missing values, summing converts True to 1 and False to 0.
import pandas as pd
import numpy as np
# Create sample DataFrame with missing values seeded into the numeric
# columns so the counting methods below have something to find.
df = pd.DataFrame({
'product': ['A', 'B', 'C', 'D', 'E', 'F'],
'price': [10.5, np.nan, 15.0, np.nan, 12.5, np.nan],
'quantity': [100, 200, np.nan, 150, np.nan, 175]
})
# Count NaN values in a single column: isna() returns a boolean Series,
# and sum() counts the True entries (True == 1, False == 0).
nan_count = df['price'].isna().sum()
print(f"NaN count in price: {nan_count}") # Output: 3
# Alternative using isnull() - functionally identical (isnull is an alias of isna)
nan_count_alt = df['price'].isnull().sum()
print(f"NaN count (isnull): {nan_count_alt}") # Output: 3
To get both counts and percentages simultaneously:
def nan_summary(series):
    """Summarize missing values in a Series.

    Parameters
    ----------
    series : pd.Series
        Column to analyze.

    Returns
    -------
    dict
        Keys: 'total' (row count), 'nan_count' (missing values),
        'valid_count' (non-missing values), and 'nan_percentage'
        formatted as a string such as '50.00%'.
    """
    total = len(series)
    # Cast to a plain int so the dict prints as {'nan_count': 3} rather
    # than exposing a NumPy scalar repr (np.int64(3) under NumPy 2).
    nan_count = int(series.isna().sum())
    # Guard against ZeroDivisionError for an empty Series: with no rows
    # there are no missing values, so 0% is the correct answer.
    nan_percentage = (nan_count / total) * 100 if total else 0.0
    return {
        'total': total,
        'nan_count': nan_count,
        'valid_count': total - nan_count,
        'nan_percentage': f"{nan_percentage:.2f}%"
    }
# Summarize the price column: 3 NaN out of 6 rows -> 50% missing.
print(nan_summary(df['price']))
# Output: {'total': 6, 'nan_count': 3, 'valid_count': 3, 'nan_percentage': '50.00%'}
Counting Across Multiple Columns
When analyzing missing data patterns across an entire DataFrame, apply aggregation methods that operate on all columns simultaneously.
# Count NaN values for all columns; sum() defaults to axis=0, so the
# boolean frame returned by isna() collapses to one count per column.
nan_counts = df.isna().sum()
print(nan_counts)
# Output:
# product 0
# price 3
# quantity 2
# dtype: int64
# Get percentage of NaN values per column
nan_percentages = (df.isna().sum() / len(df)) * 100
print(nan_percentages)
# Output:
# product 0.0
# price 50.0
# quantity 33.333333
# dtype: float64
# Create comprehensive summary; the scalar len(df) broadcasts against
# the column-indexed Series produced by the sum() calls.
summary = pd.DataFrame({
'total_rows': len(df),
'nan_count': df.isna().sum(),
'valid_count': df.notna().sum(),
'nan_percentage': (df.isna().sum() / len(df) * 100).round(2)
})
print(summary)
For row-wise analysis to identify records with missing data:
# Count NaN values per row; axis=1 sums across columns instead of down them.
nan_per_row = df.isna().sum(axis=1)
print(nan_per_row)
# Output:
# 0 0
# 1 1
# 2 1
# 3 1
# 4 1
# 5 1
# dtype: int64
# Filter rows with any NaN values; any(axis=1) marks a row True if at
# least one of its columns is missing, giving a boolean row mask.
rows_with_nan = df[df.isna().any(axis=1)]
print(f"Rows containing NaN: {len(rows_with_nan)}") # Output: 5
Using value_counts() for Detailed Analysis
The value_counts() method with dropna=False provides a frequency distribution that includes NaN values, useful for categorical analysis.
df_categorical = pd.DataFrame({
'status': ['active', 'inactive', np.nan, 'active', np.nan, 'pending', 'active'],
'region': ['North', 'South', 'East', np.nan, 'North', np.nan, 'West']
})
# Count including NaN values; dropna=False keeps NaN as its own row in
# the frequency table (value_counts drops it by default).
status_counts = df_categorical['status'].value_counts(dropna=False)
print(status_counts)
# Output:
# active 3
# NaN 2
# inactive 1
# pending 1
# Name: status, dtype: int64
# Extract just the NaN count; .get(np.nan) works because the counts
# index retains the NaN label, and the default 0 covers Series that
# contain no missing values at all.
nan_count = status_counts.get(np.nan, 0)
print(f"NaN count: {nan_count}") # Output: 2
Handling Different Missing Value Types
Pandas uses different representations for missing data depending on the dtype. Understanding these distinctions prevents counting errors.
# Traditional float column with np.nan; the Python None is coerced to
# NaN on construction of a float Series, so both entries count as missing.
float_col = pd.Series([1.0, np.nan, 3.0, None])
print(f"Float NaN count: {float_col.isna().sum()}") # Output: 2
# Nullable integer dtype with pd.NA (capital-I 'Int64', not 'int64')
int_col = pd.Series([1, 2, pd.NA, 4], dtype='Int64')
print(f"Int64 NaN count: {int_col.isna().sum()}") # Output: 1
# String dtype with pd.NA
str_col = pd.Series(['a', 'b', pd.NA, 'd'], dtype='string')
print(f"String NaN count: {str_col.isna().sum()}") # Output: 1
# Mixed object column; isna() recognizes None, np.nan and pd.NA alike,
# so all three missing markers are counted.
obj_col = pd.Series([1, 'text', None, np.nan, pd.NA])
print(f"Object NaN count: {obj_col.isna().sum()}") # Output: 3
For datasets with specific placeholder values representing missing data:
df_placeholders = pd.DataFrame({
'temperature': [25.0, -999, 30.0, -999, 28.0],
'humidity': [60, 0, 65, 70, 0]
})
# Replace placeholders with NaN; note that introducing NaN into the
# integer humidity column upcasts it to float, since NaN is a float value.
df_clean = df_placeholders.replace({
'temperature': {-999: np.nan},
'humidity': {0: np.nan}
})
print(df_clean.isna().sum())
# Output:
# temperature 2
# humidity 2
# dtype: int64
GroupBy Operations with NaN Counts
Analyzing missing data patterns across categorical groups reveals data quality issues in specific segments.
df_grouped = pd.DataFrame({
    'category': ['A', 'A', 'B', 'B', 'C', 'C', 'A', 'B'],
    'value1': [10, np.nan, 20, 30, np.nan, 40, 50, np.nan],
    'value2': [100, 200, np.nan, 300, 400, np.nan, 500, 600]
})
# Count NaN values by group. Use agg rather than apply: apply passes the
# whole sub-frame including the grouping column (deprecated behavior in
# pandas >= 2.2, removed in 3.0), while agg runs the function once per
# non-grouping column per group and keeps the result tidy.
nan_by_group = df_grouped.groupby('category').agg(
    lambda s: s.isna().sum()
)
print(nan_by_group)
# Output:
#           value1  value2
# category
# A              1       0
# B              1       1
# C              1       1
# Calculate NaN percentage by group
def group_nan_percentage(group):
    """Return the percentage of NaN values in group, rounded to 2 decimals."""
    return (group.isna().sum() / len(group) * 100).round(2)
nan_pct_by_group = df_grouped.groupby('category').agg(group_nan_percentage)
print(nan_pct_by_group)
Performance Considerations for Large Datasets
When working with large DataFrames, choose the most efficient method for your use case.
import time
# Create large DataFrame (~1M rows) for timing comparisons.
large_df = pd.DataFrame({
    'col1': np.random.choice([1, 2, np.nan], size=1_000_000),
    'col2': np.random.choice([10, 20, np.nan], size=1_000_000)
})
# Method 1: isna().sum()
# Use time.perf_counter() instead of time.time(): it is monotonic and
# high-resolution, so short benchmark intervals are measured reliably
# (time.time() can jump with clock adjustments and has coarse resolution).
start = time.perf_counter()
count1 = large_df['col1'].isna().sum()
time1 = time.perf_counter() - start
# Method 2: pd.isna().sum()
start = time.perf_counter()
count2 = pd.isna(large_df['col1']).sum()
time2 = time.perf_counter() - start
print(f"Method 1 time: {time1:.4f}s, Method 2 time: {time2:.4f}s")
# For multiple columns, use vectorized operations
start = time.perf_counter()
all_counts = large_df.isna().sum()
time_vectorized = time.perf_counter() - start
print(f"Vectorized time for all columns: {time_vectorized:.4f}s")
Practical Missing Data Report
Combine these techniques into a reusable function for comprehensive missing data analysis:
def missing_data_report(df, threshold=50):
    """
    Generate a comprehensive missing data report for a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to analyze.
    threshold : float, default 50
        Flag columns whose NaN percentage exceeds this value.

    Returns
    -------
    pd.DataFrame
        Indexed by column name, with the column dtype, non-null and
        null counts, null percentage, and a boolean 'flag' column,
        sorted by 'null_pct' in descending order.
    """
    total_rows = len(df)
    # Avoid dividing by zero on an empty DataFrame: an empty frame has
    # no missing values, so 0% everywhere is the correct report.
    denominator = total_rows if total_rows else 1
    report = pd.DataFrame({
        'dtype': df.dtypes,
        'non_null': df.notna().sum(),
        'null_count': df.isna().sum(),
        'null_pct': (df.isna().sum() / denominator * 100).round(2)
    })
    report['flag'] = report['null_pct'] > threshold
    report = report.sort_values('null_pct', ascending=False)
    print(f"Dataset Shape: {df.shape}")
    print(f"Total Rows: {total_rows}")
    print(f"\nColumns with >{threshold}% missing data:")
    print(report[report['flag']])
    return report
# Usage
df_test = pd.DataFrame({
    'id': range(100),
    'score': np.random.choice([1, 2, 3, np.nan], 100, p=[0.3, 0.3, 0.2, 0.2]),
    # Use None rather than np.nan among string choices: np.random.choice
    # coerces a mixed str/float list to a unicode array, which turns
    # np.nan into the literal string 'nan' — a value isna() would NOT
    # count as missing. A list containing None stays object dtype, so
    # the sampled None entries are genuine missing values.
    'grade': np.random.choice(['A', 'B', None], 100, p=[0.4, 0.2, 0.4])
})
report = missing_data_report(df_test, threshold=30)
This approach provides immediate insight into data quality issues and guides data cleaning strategies based on the extent and distribution of missing values across your dataset.