R - Mean, Median, Mode Calculation


Key Insights

  • R provides built-in functions for mean and median calculations, but requires custom implementation for mode due to its statistical complexity and multiple possible interpretations
  • Understanding when to use each measure of central tendency depends on data distribution—mean for normal distributions, median for skewed data, and mode for categorical or multimodal datasets
  • Vectorized operations and proper handling of NA values are critical for production-ready statistical calculations in R
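The first two insights can be made concrete with a short sketch; the exponential sample below is an assumption chosen purely because it is right-skewed:

```r
# A right-skewed sample pulls the mean above the median
set.seed(42)                    # arbitrary seed, for reproducibility
skewed <- rexp(1000, rate = 1)  # exponential draws are right-skewed

mean(skewed)                    # dragged upward by the long right tail
median(skewed)                  # stays near the bulk of the data
mean(skewed) > median(skewed)   # TRUE for this sample
```

For symmetric data the two values nearly coincide, so a large gap between mean and median is itself a quick, informal skewness check.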

Basic Mean Calculation

R’s mean() function calculates the arithmetic average of numeric vectors. The function handles NA values through the na.rm parameter, essential for real-world datasets with missing data.

# Basic mean calculation
numbers <- c(10, 20, 30, 40, 50)
result <- mean(numbers)
print(result)  # Output: 30

# Handling missing values
numbers_with_na <- c(10, 20, NA, 40, 50)
mean(numbers_with_na)  # Returns NA
mean(numbers_with_na, na.rm = TRUE)  # Returns 30

# Weighted mean
values <- c(85, 90, 78, 92)
weights <- c(0.2, 0.3, 0.25, 0.25)
weighted_mean <- weighted.mean(values, weights)
print(weighted_mean)  # Output: 86.5

For data frames, compute means across columns or rows with colMeans() and rowMeans(), which are faster than the equivalent apply() calls.

# Create sample data frame
df <- data.frame(
  score1 = c(85, 90, 78, 92, 88),
  score2 = c(88, 85, 82, 95, 90),
  score3 = c(90, 88, 80, 93, 87)
)

# Column means (average score per test)
col_means <- colMeans(df)
print(col_means)

# Row means (average score per student)
row_means <- rowMeans(df)
print(row_means)

# Handle NA values in data frames
df_with_na <- df
df_with_na[2, 2] <- NA
colMeans(df_with_na, na.rm = TRUE)
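A rough timing check supports the claim that colMeans() outperforms apply(); the sketch below uses base R's system.time on a hypothetical 1000 x 1000 matrix (coarser than a proper benchmark, but dependency-free):

```r
# Compare colMeans() with the equivalent apply() call
set.seed(1)
m <- matrix(rnorm(1e6), nrow = 1000)  # 1000 x 1000 numeric matrix

system.time(for (i in 1:50) colMeans(m))        # compiled C implementation
system.time(for (i in 1:50) apply(m, 2, mean))  # R-level loop over columns

# Both approaches agree on the result
all.equal(colMeans(m), apply(m, 2, mean))  # TRUE
```

The difference arises because apply() calls mean() once per column from interpreted R code, while colMeans() performs the entire computation in compiled code.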

Median Calculation

The median represents the middle value in a sorted dataset, making it robust against outliers. R’s median() function automatically sorts data before calculation.

# Basic median
values <- c(10, 20, 30, 40, 50)
median(values)  # Output: 30

# Even number of observations (returns average of middle two)
even_values <- c(10, 20, 30, 40)
median(even_values)  # Output: 25

# Median with outliers
data_with_outliers <- c(10, 12, 14, 15, 16, 100)
mean(data_with_outliers)    # 27.83 (affected by outlier)
median(data_with_outliers)  # 14.5 (robust to outlier)

# Median for grouped data
library(dplyr)

sales_data <- data.frame(
  region = c("North", "North", "South", "South", "East", "East"),
  revenue = c(100, 150, 200, 180, 120, 140)
)

sales_data %>%
  group_by(region) %>%
  summarise(
    mean_revenue = mean(revenue),
    median_revenue = median(revenue)
  )

Mode Implementation

R has no built-in function for the statistical mode (base R's mode() returns an object's storage mode, not its most frequent value), partly because "mode" has multiple statistical interpretations. Implement custom functions based on your requirements.

# Simple mode function (returns most frequent value)
calculate_mode <- function(x) {
  uniq_x <- unique(x)
  uniq_x[which.max(tabulate(match(x, uniq_x)))]
}

# Test with numeric data
numbers <- c(1, 2, 2, 3, 3, 3, 4, 4)
calculate_mode(numbers)  # Output: 3

# Test with character data
categories <- c("A", "B", "B", "C", "C", "C", "D")
calculate_mode(categories)  # Output: "C"

For multimodal distributions (multiple modes), implement a function that returns all modes:

# Function to find all modes
find_all_modes <- function(x) {
  freq_table <- table(x)
  max_freq <- max(freq_table)
  modes <- names(freq_table[freq_table == max_freq])
  
  # Convert back to original type
  if (is.numeric(x)) {
    return(as.numeric(modes))
  } else {
    return(modes)
  }
}

# Bimodal distribution
bimodal <- c(1, 1, 1, 2, 3, 4, 4, 4, 5)
find_all_modes(bimodal)  # Output: 1 4

# No mode (all values equally frequent)
uniform <- c(1, 2, 3, 4, 5)
find_all_modes(uniform)  # Output: 1 2 3 4 5

Advanced mode calculation with frequency threshold:

# Mode with minimum frequency requirement
mode_with_threshold <- function(x, min_freq = 2) {
  freq_table <- table(x)
  max_freq <- max(freq_table)
  
  if (max_freq < min_freq) {
    return(NULL)  # No mode meets threshold
  }
  
  modes <- names(freq_table[freq_table == max_freq])
  
  if (is.numeric(x)) {
    return(as.numeric(modes))
  } else {
    return(modes)
  }
}

# Test with threshold
sparse_data <- c(1, 2, 3, 4, 5, 6)
mode_with_threshold(sparse_data, min_freq = 2)  # NULL

dense_data <- c(1, 1, 2, 3, 4, 5, 6)
mode_with_threshold(dense_data, min_freq = 2)  # 1

Practical Application: Data Analysis Pipeline

Combine all three measures for comprehensive data analysis:

# Complete statistical summary function
statistical_summary <- function(x, na.rm = TRUE) {
  if (!is.numeric(x)) {
    stop("Input must be numeric")
  }
  
  na_count <- sum(is.na(x))  # count NAs before any removal
  
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  
  list(
    mean = mean(x),
    median = median(x),
    mode = calculate_mode(x),
    std_dev = sd(x),
    min = min(x),
    max = max(x),
    n = length(x),
    na_count = na_count
  )
}

# Apply to real dataset
set.seed(123)
sample_data <- c(rnorm(100, mean = 50, sd = 10), rep(55, 10))
summary_stats <- statistical_summary(sample_data)
print(summary_stats)

Analyze multiple columns efficiently:

# Multi-column analysis
analyze_dataframe <- function(df) {
  numeric_cols <- sapply(df, is.numeric)
  results <- data.frame(
    column = names(df)[numeric_cols],
    mean = sapply(df[numeric_cols], mean, na.rm = TRUE),
    median = sapply(df[numeric_cols], median, na.rm = TRUE),
    mode = sapply(df[numeric_cols], calculate_mode),
    row.names = NULL
  )
  return(results)
}

# Example usage
test_df <- data.frame(
  age = c(25, 30, 35, 30, 40, 30),
  salary = c(50000, 60000, 70000, 60000, 80000, 60000),
  score = c(85, 90, 85, 95, 85, 100)
)

analysis_results <- analyze_dataframe(test_df)
print(analysis_results)

Performance Considerations

For large datasets, vectorized operations significantly outperform loops:

# Benchmark comparison
library(microbenchmark)

large_vector <- rnorm(1000000)

microbenchmark(
  base_mean = mean(large_vector),
  manual_mean = sum(large_vector) / length(large_vector),
  times = 100
)
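Both alternatives above are vectorized; to see the loop penalty the section mentions, compare against a hypothetical R-level loop (timed here with base system.time to avoid extra dependencies):

```r
# An explicit R loop for the mean, for comparison only
loop_mean <- function(x) {
  total <- 0
  for (value in x) {
    total <- total + value
  }
  total / length(x)
}

large_vector <- rnorm(1000000)

system.time(mean(large_vector))       # vectorized, compiled code
system.time(loop_mean(large_vector))  # interpreted loop, much slower

all.equal(mean(large_vector), loop_mean(large_vector))  # TRUE
```

The results match to floating-point tolerance; only the execution time differs, because each iteration of the R loop pays interpreter overhead.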

# Efficient mode calculation for large datasets
efficient_mode <- function(x) {
  # Sort the frequency table once and take the most frequent entry
  freq <- sort(table(x), decreasing = TRUE)
  as.numeric(names(freq)[1])  # assumes numeric input
}

Handling Edge Cases

Production code must handle edge cases gracefully:

robust_central_tendency <- function(x) {
  # Check for empty vector
  if (length(x) == 0) {
    return(list(mean = NA, median = NA, mode = NA))
  }
  
  # Remove NA values
  x_clean <- x[!is.na(x)]
  
  # Check if all values are NA
  if (length(x_clean) == 0) {
    return(list(mean = NA, median = NA, mode = NA))
  }
  
  # Calculate metrics
  list(
    mean = mean(x_clean),
    median = median(x_clean),
    mode = calculate_mode(x_clean),
    valid_n = length(x_clean),
    na_proportion = sum(is.na(x)) / length(x)
  )
}

# Test edge cases
robust_central_tendency(c())  # Empty vector
robust_central_tendency(c(NA, NA, NA))  # All NA
robust_central_tendency(c(1, NA, 2, NA, 3))  # Mixed

These implementations provide production-ready solutions for calculating central tendency measures in R. Choose mean for symmetric distributions, median for skewed data or when outliers exist, and mode for categorical analysis or identifying the most common value in your dataset.
