R tidyr fill(): Fill Missing Values
Key Insights
- The fill() function propagates non-missing values forward or backward within columns, essential for cleaning datasets with gaps in repeated measurements or hierarchical data
- Direction control ("down", "up", "downup", "updown") determines how values propagate, with "down" being the default and the most common choice for time-series and grouped data
- fill() works seamlessly with group_by() to fill values within specific categories, preventing value leakage across logical data boundaries
Understanding the fill() Function
The fill() function from tidyr addresses a common data cleaning challenge: missing values that should logically carry forward from previous observations. This occurs frequently in spreadsheet-style data where category labels appear only at the first occurrence, or in time-series data with intermittent measurements.
library(tidyr)
library(dplyr)
# Basic dataset with missing values
sales_data <- tibble(
  region = c("North", NA, NA, "South", NA, "West"),
  quarter = c("Q1", "Q2", "Q3", "Q1", "Q2", "Q1"),
  revenue = c(100, 150, 200, 120, 180, 90)
)
print(sales_data)
# region quarter revenue
# <chr> <chr> <dbl>
# 1 North Q1 100
# 2 NA Q2 150
# 3 NA Q3 200
# 4 South Q1 120
# 5 NA Q2 180
# 6 West Q1 90
# Fill down the region column
filled_data <- sales_data %>%
  fill(region)
print(filled_data)
# region quarter revenue
# <chr> <chr> <dbl>
# 1 North Q1 100
# 2 North Q2 150
# 3 North Q3 200
# 4 South Q1 120
# 5 South Q2 180
# 6 West Q1 90
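Filling down is conceptually just "last observation carried forward" (LOCF). The sketch below shows the rule in base R as an illustration (not tidyr's actual implementation): each position indexes into the vector of non-missing values seen so far.

```r
# Base-R sketch of last-observation-carried-forward (LOCF),
# the rule fill() applies with .direction = "down".
# Illustration only, not tidyr's internal implementation.
locf <- function(x) {
  idx <- cumsum(!is.na(x))  # index of the most recent non-NA value
  idx[idx == 0] <- NA       # leading NAs have nothing to carry forward
  x[!is.na(x)][idx]
}
locf(c("North", NA, NA, "South", NA, "West"))
# [1] "North" "North" "North" "South" "South" "West"
```

Note how leading NAs stay NA, mirroring fill()'s behavior with the default downward direction.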
Direction Control
The .direction parameter controls how values propagate through missing cells. Four options provide flexibility for different data structures.
# Sample data with gaps
measurements <- tibble(
  time = 1:8,
  sensor_a = c(23.5, NA, NA, 24.1, NA, NA, NA, 25.0),
  sensor_b = c(NA, NA, 18.2, NA, 19.1, NA, NA, NA)
)
# Fill down (default)
measurements %>%
  fill(sensor_a, sensor_b, .direction = "down")
# time sensor_a sensor_b
# <int> <dbl> <dbl>
#1 1 23.5 NA
#2 2 23.5 NA
#3 3 23.5 18.2
#4 4 24.1 18.2
#5 5 24.1 19.1
#6 6 24.1 19.1
#7 7 24.1 19.1
#8 8 25.0 19.1
# Fill up
measurements %>%
  fill(sensor_a, sensor_b, .direction = "up")
# time sensor_a sensor_b
# <int> <dbl> <dbl>
#1 1 23.5 18.2
#2 2 24.1 18.2
#3 3 24.1 18.2
#4 4 24.1 19.1
#5 5 25.0 19.1
#6 6 25.0 NA
#7 7 25.0 NA
#8 8 25.0 NA
# Fill downup (down first, then up)
measurements %>%
  fill(sensor_a, sensor_b, .direction = "downup")
# time sensor_a sensor_b
# <int> <dbl> <dbl>
#1 1 23.5 18.2
#2 2 23.5 18.2
#3 3 23.5 18.2
#4 4 24.1 18.2
#5 5 24.1 19.1
#6 6 24.1 19.1
#7 7 24.1 19.1
#8 8 25.0 19.1
# Fill updown (up first, then down)
measurements %>%
  fill(sensor_a, sensor_b, .direction = "updown")
# time sensor_a sensor_b
# <int> <dbl> <dbl>
#1 1 23.5 18.2
#2 2 24.1 18.2
#3 3 24.1 18.2
#4 4 24.1 19.1
#5 5 25.0 19.1
#6 6 25.0 19.1
#7 7 25.0 19.1
#8 8 25.0 19.1
Grouped Operations
Combining fill() with group_by() prevents values from bleeding across logical boundaries in your data. This is critical for maintaining data integrity in multi-category datasets.
# Customer transaction data: customer 3's first payment method is unknown
transactions <- tibble(
  customer_id = c(1, 1, 1, 2, 2, 3, 3, 3),
  date = as.Date(c("2024-01-01", "2024-01-05", "2024-01-10",
                   "2024-01-02", "2024-01-08",
                   "2024-01-03", "2024-01-07", "2024-01-12")),
  payment_method = c("Credit", NA, NA, "Debit", NA, NA, "Cash", NA),
  amount = c(50, 75, 100, 60, 80, 45, 55, 70)
)
# Without grouping - INCORRECT: customer 2's "Debit" leaks into customer 3
transactions %>%
  fill(payment_method)
# customer_id date       payment_method amount
# <dbl>       <date>     <chr>          <dbl>
# 1 1         2024-01-01 Credit         50
# 2 1         2024-01-05 Credit         75
# 3 1         2024-01-10 Credit         100
# 4 2         2024-01-02 Debit          60
# 5 2         2024-01-08 Debit          80
# 6 3         2024-01-03 Debit          45
# 7 3         2024-01-07 Cash           55
# 8 3         2024-01-12 Cash           70
# With grouping - CORRECT: customer 3's leading NA stays NA
transactions %>%
  group_by(customer_id) %>%
  fill(payment_method) %>%
  ungroup()
# customer_id date       payment_method amount
# <dbl>       <date>     <chr>          <dbl>
# 1 1         2024-01-01 Credit         50
# 2 1         2024-01-05 Credit         75
# 3 1         2024-01-10 Credit         100
# 4 2         2024-01-02 Debit          60
# 5 2         2024-01-08 Debit          80
# 6 3         2024-01-03 NA             45
# 7 3         2024-01-07 Cash           55
# 8 3         2024-01-12 Cash           70
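One further caveat: fill() propagates strictly in row order, not in date order. If the rows of a grouped frame arrive unsorted, a value can be carried "backward" in real time. A safe pattern, using the transactions data above, is to arrange within groups before filling:

```r
# fill() works in row order, so sort within each group first;
# an unsorted frame can carry a later payment method into an earlier row.
transactions %>%
  arrange(customer_id, date) %>%
  group_by(customer_id) %>%
  fill(payment_method, .direction = "down") %>%
  ungroup()
```

The arrange() step is a no-op when the data is already sorted, so including it costs little and guards against silently wrong fills.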
Multiple Column Selection
The fill() function accepts multiple columns and tidy selection helpers, making it efficient for wide datasets.
# Wide format sensor data
sensor_readings <- tibble(
  timestamp = 1:6,
  temp_1 = c(20.1, NA, 20.3, NA, NA, 20.8),
  temp_2 = c(19.8, NA, NA, 20.1, NA, 20.5),
  humidity_1 = c(65, NA, NA, 68, NA, 70),
  humidity_2 = c(64, 66, NA, NA, 69, NA),
  pressure = c(1013, NA, 1014, NA, NA, 1015)
)
# Fill all columns starting with "temp"
sensor_readings %>%
  fill(starts_with("temp"), .direction = "down")
# Fill all numeric columns
sensor_readings %>%
  fill(where(is.numeric), .direction = "down")
# Fill specific columns
sensor_readings %>%
  fill(temp_1, temp_2, humidity_1, humidity_2, .direction = "down")
# timestamp temp_1 temp_2 humidity_1 humidity_2 pressure
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 20.1 19.8 65 64 1013
# 2 2 20.1 19.8 65 66 NA
# 3 3 20.3 19.8 65 66 1014
# 4 4 20.3 20.1 68 66 NA
# 5 5 20.3 20.1 68 69 NA
# 6 6 20.8 20.5 70 69 1015
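Tidy selection also supports negation, which is convenient when every column except a key should be filled. Assuming the usual tidyselect negation syntax, which fill() accepts in current tidyr versions:

```r
# Fill every column except the timestamp key
sensor_readings %>%
  fill(!timestamp, .direction = "down")
```

Note that where(is.numeric) in the earlier example also selects timestamp; that is harmless here only because timestamp contains no missing values.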
Handling Leading NAs
When missing values appear at the beginning of a column, fill() behavior depends on direction. Leading NAs remain when filling down, but get filled when using up or bidirectional filling.
# Data with leading NAs
product_data <- tibble(
  category = c(NA, NA, "Electronics", NA, "Furniture", NA),
  product = c("Item A", "Item B", "Item C", "Item D", "Item E", "Item F"),
  price = c(100, 150, 200, 180, 300, 250)
)
# Fill down - leading NAs remain
product_data %>%
  fill(category, .direction = "down")
# category product price
# <chr> <chr> <dbl>
# 1 NA Item A 100
# 2 NA Item B 150
# 3 Electronics Item C 200
# 4 Electronics Item D 180
# 5 Furniture Item E 300
# 6 Furniture Item F 250
# Fill up - leading NAs get filled
product_data %>%
  fill(category, .direction = "up")
# category product price
# <chr> <chr> <dbl>
# 1 Electronics Item A 100
# 2 Electronics Item B 150
# 3 Electronics Item C 200
# 4 Furniture Item D 180
# 5 Furniture Item E 300
# 6 NA Item F 250
# Fill downup - leading and interior NAs all filled
product_data %>%
  fill(category, .direction = "downup")
# category product price
# <chr> <chr> <dbl>
# 1 Electronics Item A 100
# 2 Electronics Item B 150
# 3 Electronics Item C 200
# 4 Electronics Item D 180
# 5 Furniture Item E 300
# 6 Furniture Item F 250
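When a leading NA has no earlier value to inherit but still needs an explicit label, replace_na() pairs naturally with fill(). The "Unknown" label here is just an assumed placeholder:

```r
# Fill down, then give the untouched leading NAs an explicit label
product_data %>%
  fill(category, .direction = "down") %>%
  replace_na(list(category = "Unknown"))
# category    product price
# 1 Unknown     Item A  100
# 2 Unknown     Item B  150
# 3 Electronics Item C  200
# 4 Electronics Item D  180
# 5 Furniture   Item E  300
# 6 Furniture   Item F  250
```

Unlike "downup", this keeps the fact that the first two rows had no known category, rather than borrowing "Electronics" from later rows.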
Real-World Pipeline Example
Here’s a complete data cleaning pipeline combining fill() with other tidyr and dplyr functions.
# Raw survey data with hierarchical structure
survey_raw <- tibble(
  section = c("Demographics", NA, NA, "Preferences", NA, NA, "Feedback", NA),
  question_id = c("D1", "D2", "D3", "P1", "P2", "P3", "F1", "F2"),
  question = c("Age", "Gender", "Location", "Color", "Size", "Style",
               "Rating", "Comments"),
  response = c("25-34", NA, "Urban", "Blue", NA, "Modern", "4", NA)
)
# Complete cleaning pipeline
survey_clean <- survey_raw %>%
  # Fill section names
  fill(section, .direction = "down") %>%
  # Create section groups
  group_by(section) %>%
  # Fill responses within sections only
  fill(response, .direction = "down") %>%
  ungroup() %>%
  # Remove rows with no response data
  filter(!is.na(response)) %>%
  # Add computed columns
  mutate(
    question_full = paste(section, question, sep = " - "),
    has_response = !is.na(response)
  )
print(survey_clean)
# section      question_id question response question_full           has_response
# <chr>        <chr>       <chr>    <chr>    <chr>                   <lgl>
# 1 Demographics D1        Age      25-34    Demographics - Age      TRUE
# 2 Demographics D2        Gender   25-34    Demographics - Gender   TRUE
# 3 Demographics D3        Location Urban    Demographics - Location TRUE
# 4 Preferences  P1        Color    Blue     Preferences - Color     TRUE
# 5 Preferences  P2        Size     Blue     Preferences - Size      TRUE
# 6 Preferences  P3        Style    Modern   Preferences - Style     TRUE
# 7 Feedback     F1        Rating   4        Feedback - Rating       TRUE
# 8 Feedback     F2        Comments 4        Feedback - Comments     TRUE
The fill() function streamlines data preparation by handling missing value propagation logically and efficiently. Combined with grouping operations and tidy selection, it becomes an essential tool for transforming messy real-world data into analysis-ready formats.
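For completeness: recent tidyr versions build fill() on vctrs::vec_fill_missing(), which exposes the same directional rule for bare vectors outside a data frame (assuming a reasonably current vctrs is installed):

```r
library(vctrs)

# The same propagation rule applied to a plain vector
vec_fill_missing(c(NA, 1, NA, 2, NA), direction = "down")
# [1] NA  1  1  2  2
vec_fill_missing(c(NA, 1, NA, 2, NA), direction = "downup")
# [1] 1 1 1 2 2
```

This is handy when cleaning a single column before it ever enters a tibble, or inside functions that operate on vectors rather than data frames.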