NumPy - np.isnan() and np.isinf() | Application Architect

Key Insights

• np.isnan() and np.isinf() provide vectorized operations for detecting NaN and infinity values in NumPy arrays, significantly faster than calling the standard library’s math.isnan() and math.isinf() on each element.
• These functions return boolean arrays of the same shape as the input, enabling the element-wise masking, filtering, and replacement operations that data cleaning pipelines depend on.
• Combining these functions with np.isfinite() and boolean indexing creates robust data validation workflows that handle edge cases in numerical computations.

Understanding NaN and Infinity in NumPy

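NumPy represents special floating-point values according to the IEEE 754 standard. NaN (Not a Number) results from undefined operations such as 0/0 or inf - inf, while infinity (inf) results from overflow past the largest representable float or from dividing a nonzero value by zero. These values propagate through calculations and require explicit handling. Because NaN compares unequal to every value, including itself, a test such as data == np.nan never matches; use np.isnan() instead.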

import numpy as np

# Creating special values
nan_value = np.nan
inf_value = np.inf
neg_inf = -np.inf

# The operations below are invalid on purpose. Without np.errstate NumPy
# emits a RuntimeWarning for each of them ("invalid value", "divide by zero",
# "overflow"), and the script aborts when warnings are treated as errors.
# The computed results are the same either way.
with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
    # Operations that produce NaN
    result1 = np.array([0.0]) / np.array([0.0])  # [nan]
    result2 = np.inf - np.inf  # nan
    result3 = np.sqrt(np.array([-1.0]))  # [nan]

    # Operations that produce infinity
    result4 = np.array([1.0]) / np.array([0.0])  # [inf]
    result5 = np.exp(1000)  # inf (overflow)

print(f"NaN: {result1[0]}")
print(f"Infinity: {result4[0]}")

Basic Usage of np.isnan()

np.isnan() returns True for NaN values and False otherwise. It operates element-wise on arrays, making it efficient for large datasets.

import numpy as np

# A scalar argument yields a single boolean
value = np.nan
print(np.isnan(value))  # True

# An array argument yields a boolean mask with the same shape as the input
data = np.array([1.0, np.nan, 3.0, np.inf, np.nan, -5.0])
nan_mask = np.isnan(data)
print(nan_mask)  # [False  True False False  True False]

# Each True counts as 1, so summing the mask counts the NaN entries
nan_count = nan_mask.sum()
print(f"NaN count: {nan_count}")  # NaN count: 2

# Positions at which the mask is True
nan_indices = np.flatnonzero(nan_mask)
print(f"NaN at indices: {nan_indices}")  # NaN at indices: [1 4]

Basic Usage of np.isinf()

np.isinf() detects both positive and negative infinity. Use np.isposinf() and np.isneginf() for specific infinity signs.

import numpy as np

data = np.array([1.0, np.inf, -np.inf, 0.0, np.nan])

# Infinity of either sign (NaN is not infinity)
inf_mask = np.isinf(data)
print(inf_mask)  # [False  True  True False False]

# +inf only
pos_inf_mask = np.isposinf(data)
print(pos_inf_mask)  # [False  True False False False]

# -inf only
neg_inf_mask = np.isneginf(data)
print(neg_inf_mask)  # [False False  True False False]

# One mask covering every non-finite entry: NaN or +/-inf
special_values = np.isnan(data) | np.isinf(data)
print(f"Special values: {special_values.sum()}")  # 3

Data Cleaning with Boolean Indexing

Boolean arrays from these functions enable powerful filtering and replacement operations.

import numpy as np

# Sample readings polluted with NaN and +/-inf
measurements = np.array([23.5, np.nan, 45.2, np.inf, 12.8, -np.inf, 67.3, np.nan])

# Filtering: keep only the finite entries
valid_mask = np.isfinite(measurements)
clean_data = measurements[valid_mask]
print(f"Clean data: {clean_data}")  # [23.5 45.2 12.8 67.3]

# Imputation: NaN becomes the mean of the finite entries; the infinities are
# left in place for the next step. np.where builds a new array, so
# `measurements` itself is never modified.
mean_value = clean_data.mean()
cleaned = np.where(np.isnan(measurements), mean_value, measurements)
print(f"NaN replaced: {cleaned}")

# Clamping: +inf / -inf become the largest / most negative finite float64
float64_limits = np.finfo(np.float64)
cleaned = np.where(np.isposinf(cleaned), float64_limits.max, cleaned)
cleaned = np.where(np.isneginf(cleaned), float64_limits.min, cleaned)
print(f"Fully cleaned: {cleaned}")

Working with Multidimensional Arrays

These functions preserve array dimensionality, enabling row-wise or column-wise operations.

import numpy as np

# 4x4 grid in which every row holds exactly one NaN or inf
matrix = np.array([
    [1.0, 2.0, np.nan, 4.0],
    [5.0, np.inf, 7.0, 8.0],
    [9.0, 10.0, 11.0, np.nan],
    [np.inf, 14.0, 15.0, 16.0]
])

# Element-wise finiteness mask, reused by the per-row queries below
finite_cells = np.isfinite(matrix)

# Reducing along axis=1 collapses the columns: one answer per row
rows_with_nan = np.isnan(matrix).any(axis=1)
print(f"Rows with NaN: {np.flatnonzero(rows_with_nan)}")  # [0 2]

# Reducing along axis=0 collapses the rows: one answer per column
cols_with_inf = np.isinf(matrix).any(axis=0)
print(f"Columns with inf: {np.flatnonzero(cols_with_inf)}")  # [0 1]

# Number of non-finite cells in each row
special_per_row = (~finite_cells).sum(axis=1)
print(f"Special values per row: {special_per_row}")  # [1 1 1 1]

# Keep only the rows whose cells are all finite (none survive here)
clean_rows = matrix[finite_cells.all(axis=1)]
print(f"Clean rows shape: {clean_rows.shape}")  # (0, 4)

Performance Comparison

NumPy’s vectorized operations significantly outperform calling the standard library’s math functions on each element in a Python loop.

import numpy as np
import math
import time

# Large dataset
data = np.random.randn(1000000)
data[::1000] = np.nan  # Insert some NaN values

# time.perf_counter() is the clock meant for measuring short intervals: it is
# monotonic and uses the highest resolution available. time.time() can have a
# tick coarser than the NumPy call itself, which would report 0.0 seconds.

# NumPy approach: a single vectorized call
start = time.perf_counter()
nan_mask = np.isnan(data)
numpy_time = time.perf_counter() - start

# Pure-Python approach (slower): one math.isnan() call per element
start = time.perf_counter()
python_mask = [math.isnan(x) for x in data]
python_time = time.perf_counter() - start

print(f"NumPy time: {numpy_time:.4f}s")
print(f"Python time: {python_time:.4f}s")
# Guard the ratio so an unmeasurably fast NumPy run cannot divide by zero.
speedup = python_time / numpy_time if numpy_time > 0 else float("inf")
print(f"Speedup: {speedup:.1f}x")

Practical Data Validation Pipeline

Combine these functions to build robust validation workflows for real-world data processing.

import numpy as np

def validate_and_clean(data, strategy='remove', fill_value=0):
    """
    Validate and clean numerical data.

    The input is never modified; every strategy returns a new array.

    Parameters:
    - data: numpy array or array-like of any shape
    - strategy: 'remove', 'fill', or 'boundary'
        'remove'   - drop every NaN/inf element (the result is always 1-D)
        'fill'     - replace every NaN/inf element with fill_value
        'boundary' - replace NaN with 0 and +inf/-inf with the largest /
                     most negative finite value of the array's float dtype
    - fill_value: value for 'fill' strategy

    Returns:
    - (cleaned, stats): the cleaned array and a dict of plain Python ints
      with keys 'total', 'nan_count', 'inf_count' and 'valid_count', all
      counted over every element of the input.

    Raises:
    - ValueError: if strategy is not one of the three names above.
    """
    data = np.asarray(data)
    finite_mask = np.isfinite(data)

    # data.size (not len(data)) so that 'total' counts elements rather than
    # rows for multi-dimensional input, matching the other three counters.
    # np.count_nonzero returns plain ints, so the dict prints cleanly.
    stats = {
        'total': int(data.size),
        'nan_count': int(np.count_nonzero(np.isnan(data))),
        'inf_count': int(np.count_nonzero(np.isinf(data))),
        'valid_count': int(np.count_nonzero(finite_mask))
    }

    if strategy == 'remove':
        cleaned = data[finite_mask]
    elif strategy == 'fill':
        cleaned = data.copy()
        cleaned[~finite_mask] = fill_value
    elif strategy == 'boundary':
        cleaned = data.copy()
        # Integer and boolean arrays cannot hold NaN or inf, and np.finfo()
        # rejects their dtypes, so there is nothing to replace for them.
        if np.issubdtype(cleaned.dtype, np.inexact):
            limits = np.finfo(cleaned.dtype)
            cleaned[np.isnan(cleaned)] = 0
            cleaned[np.isposinf(cleaned)] = limits.max
            cleaned[np.isneginf(cleaned)] = limits.min
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    return cleaned, stats

# Example usage: swap every NaN/inf in a small sample for 0
raw_data = np.array([1.0, 2.0, np.nan, np.inf, 5.0, -np.inf, 7.0])

fill_options = dict(strategy='fill', fill_value=0)
cleaned, stats = validate_and_clean(raw_data, **fill_options)
print(f"Statistics: {stats}")
print(f"Cleaned data: {cleaned}")

# Validation for machine learning pipelines
def check_data_quality(X, threshold=0.1):
    """Check if data quality meets threshold for ML.

    Parameters:
    - X: numpy array or array-like of any shape
    - threshold: largest tolerated fraction (0.0-1.0) of elements that are
      NaN or +/-inf

    Returns:
    - True when the fraction of non-finite elements does not exceed
      threshold. Empty input has no invalid elements, so it passes.

    Raises:
    - ValueError: if the fraction of non-finite elements exceeds threshold.
    """
    X = np.asarray(X)
    total_elements = X.size
    if total_elements == 0:
        # Avoid the 0/0 below, which would yield NaN plus a RuntimeWarning.
        return True

    invalid_elements = np.count_nonzero(~np.isfinite(X))
    invalid_ratio = invalid_elements / total_elements

    if invalid_ratio > threshold:
        raise ValueError(
            f"Data quality too low: {invalid_ratio:.2%} invalid values "
            f"(threshold: {threshold:.2%})"
        )
    return True

# Test quality check
test_data = np.random.randn(100, 10)
# 5 of the 1000 elements become NaN: 0.5% invalid overall (5% of column 0),
# which is well under the 10% threshold, so the check below passes.
test_data[0:5, 0] = np.nan

try:
    check_data_quality(test_data, threshold=0.1)
    print("Data quality check passed")
except ValueError as e:
    print(f"Quality check failed: {e}")

Edge Cases and Considerations

Handle special scenarios where these functions behave differently than expected.

import numpy as np

# Integer arrays never contain NaN or inf (in standard NumPy)
int_array = np.array([1, 2, 3], dtype=np.int32)
print(np.isnan(int_array))  # [False False False]

# Complex numbers: an element is NaN/inf if its real OR imaginary part is.
# Build the values with complex(real, imag); the literal `4 + np.inf*1j`
# evaluates to (nan+infj), because inf * 1j multiplies inf by the zero real
# part of 1j and inf * 0 is NaN.
complex_array = np.array([1+2j, complex(np.nan, 3), complex(4, np.inf)])
print(np.isnan(complex_array))  # [False  True False]
print(np.isinf(complex_array))  # [False False  True]

# Structured arrays require field access
structured = np.array(
    [(1.0, np.nan), (np.inf, 3.0)],
    dtype=[('a', 'f8'), ('b', 'f8')]
)
# Field 'a' holds [1.0, inf] and field 'b' holds [nan, 3.0]
print(np.isnan(structured['b']))  # [ True False]
print(np.isinf(structured['a']))  # [False  True]

# Empty arrays
empty = np.array([])
print(np.isnan(empty))  # []
print(np.any(np.isnan(empty)))  # False

These functions form the foundation of numerical data validation in NumPy. Use them to ensure data integrity before statistical analysis, machine learning model training, or numerical simulations. The combination of vectorized performance and intuitive boolean indexing makes them essential tools for production data pipelines.