R tidyr - nest() and unnest() | Application Architect

Key Insights

nest() transforms grouped data into list-columns, creating hierarchical data structures where each group becomes a tibble stored in a single row
unnest() expands list-columns back into regular columns, with options to control how multiple list-columns interact and handle missing values
Nesting enables powerful workflows like fitting models per group, storing complex objects in data frames, and managing JSON-like hierarchical data structures

Understanding List-Columns

List-columns are the foundation of tidyr’s nesting capabilities. Unlike typical data frame columns that contain atomic vectors (numeric, character, logical), list-columns contain lists where each element can be any R object—including entire data frames.

library(tidyr)
library(dplyr)

# A tibble whose `data` column is a list: every row holds its own
# numeric vector, and the vectors may differ in length
df <- tibble(
  id = seq_len(3),
  data = list(c(1, 2, 3), c(4, 5), c(6, 7, 8, 9))
)

print(df)
#> # A tibble: 3 × 2
#>      id data     
#>   <int> <list>   
#> 1     1 <dbl [3]>
#> 2     2 <dbl [2]>
#> 3     3 <dbl [4]>

This structure allows storing variable-length data or complex objects alongside scalar values, enabling sophisticated data manipulation patterns.

Basic Nesting Operations

nest() collapses rows into list-columns based on grouping. The basic syntax takes column specifications indicating which columns to nest.

# Quarterly revenue and costs for two regions (four quarters each)
sales <- tibble(
  region = rep(c("North", "South"), each = 4),
  quarter = rep(paste0("Q", 1:4), times = 2),
  revenue = c(120, 135, 142, 150, 98, 105, 110, 115),
  costs = c(80, 85, 90, 95, 65, 70, 72, 75)
)

# Pack the per-quarter columns into one tibble per region; `region` is
# the only column left outside, so it defines the groups
nested_sales <- nest(sales, data = c(quarter, revenue, costs))

print(nested_sales)
#> # A tibble: 2 × 2
#>   region data            
#>   <chr>  <list>          
#> 1 North  <tibble [4 × 3]>
#> 2 South  <tibble [4 × 3]>

# Look inside the first group's tibble (North)
nested_sales[["data"]][[1]]
#> # A tibble: 4 × 3
#>   quarter revenue costs
#>   <chr>     <dbl> <dbl>
#> 1 Q1          120    80
#> 2 Q2          135    85
#> 3 Q3          142    90
#> 4 Q4          150    95

Alternative nesting approaches provide flexibility:

# Nest everything except region
sales %>% nest(data = -region)

# Using .by for inline grouping (tidyr 1.3.0+)
sales %>% nest(.by = region)

# Combine a named list-column with .by: `purchase` and `date` are nested
# into `purchases`, and `name` is the only column kept outside.
# Note: `id` is selected by neither argument, so it is not carried into
# the result.
customers <- tibble(
  id = 1:6,
  name = c("Alice", "Bob", "Charlie", "Alice", "Bob", "Charlie"),
  purchase = c(100, 200, 150, 120, 180, 160),
  date = as.Date("2024-01-01") + 0:5
)

customers %>%
  nest(purchases = c(purchase, date), .by = name)

Unnesting Data

unnest() reverses the nesting operation, expanding list-columns back into rows.

# Expand each region's nested tibble back into ordinary rows
unnest(nested_sales, cols = data)
#> # A tibble: 8 × 4
#>   region quarter revenue costs
#>   <chr>  <chr>     <dbl> <dbl>
#> 1 North  Q1          120    80
#> 2 North  Q2          135    85
#> 3 North  Q3          142    90
#> 4 North  Q4          150    95
#> 5 South  Q1           98    65
#> 6 South  Q2          105    70
#> 7 South  Q3          110    72
#> 8 South  Q4          115    75

When unnesting multiple list-columns, control the expansion behavior:

df_multi <- tibble(
  id = 1:2,
  x = list(c("a", "b"), c("c", "d", "e")),
  y = list(c(1, 2), c(3, 4, 5))
)

# Default: parallel unnesting. x and y are expanded side by side, so
# within each row their lengths must match.
df_multi %>% unnest(c(x, y))

# List-columns whose lengths differ within a row (2 vs 3, then 1 vs 2);
# unnesting them together with unnest(c(letters, numbers)) would fail
# for the first row
df_crossed <- tibble(
  id = 1:2,
  letters = list(c("a", "b"), c("c")),
  numbers = list(c(1, 2, 3), c(4, 5))
)

# Handle the mismatch by unnesting one column at a time: each step
# repeats the remaining list element for every new row, producing all
# letter/number combinations per id (6 rows for id 1, 2 rows for id 2).
# (keep_empty = TRUE is a separate option: it keeps rows whose list
# element is empty or NULL, filling them with NA, instead of dropping
# them.)
df_crossed %>%
  unnest(letters) %>%
  unnest(numbers)

Practical Pattern: Group-wise Models

Nesting excels at fitting separate models for each group and storing results compactly.

library(purrr)
library(broom)

# Fit linear models per region: one lm() per nested tibble, with the
# fitted model and its broom summaries stored as list-columns
model_results <- sales %>%
  nest(.by = region) %>%
  mutate(
    model = map(data, ~lm(revenue ~ costs, data = .x)),
    tidied = map(model, tidy),
    glanced = map(model, glance)
  )

# Extract model coefficients
# (each fit uses only 4 observations, i.e. 2 residual degrees of freedom)
model_results %>%
  select(region, tidied) %>%
  unnest(tidied)
#> # A tibble: 4 × 6
#>   region term        estimate std.error statistic p.value
#>   <chr>  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
#> 1 North  (Intercept)   -33.0     22.4       -1.47 0.278  
#> 2 North  costs           1.94     0.255      7.60 0.0169 
#> 3 South  (Intercept)   -14.0      9.08      -1.55 0.262  
#> 4 South  costs           1.72     0.129     13.3  0.00557

# Extract model statistics
model_results %>%
  select(region, glanced) %>%
  unnest(glanced) %>%
  select(region, r.squared, adj.r.squared, AIC)

Working with JSON-like Structures

Nesting handles hierarchical data similar to JSON documents.

# Simulated API payload: one row per event, with free-form metadata
# stored as a named list in each row (only purchases carry `amount`)
api_data <- tibble(
  user_id = c(1, 1, 2, 2, 3),
  event_type = c("login", "purchase", "login", "logout", "purchase"),
  timestamp = Sys.time() + 3600 * 1:5,
  metadata = list(
    list(ip = "192.168.1.1", device = "mobile"),
    list(ip = "192.168.1.1", device = "mobile", amount = 99.99),
    list(ip = "10.0.0.1", device = "desktop"),
    list(ip = "10.0.0.1", device = "desktop"),
    list(ip = "172.16.0.1", device = "tablet", amount = 149.99)
  )
)

# One row per user, with that user's events packed into `events`
user_sessions <- nest(
  api_data,
  events = c(event_type, timestamp, metadata),
  .by = user_id
)

# Flatten the events again and lift the metadata fields into regular
# columns; a missing `amount` becomes NA
user_sessions %>%
  unnest(events) %>%
  mutate(
    ip = map_chr(metadata, function(m) m$ip),
    device = map_chr(metadata, function(m) m$device),
    amount = map_dbl(metadata, function(m) m$amount %||% NA_real_)
  ) %>%
  select(-metadata)

Advanced: Nested Iterations

Combine nesting with map() and a custom analysis function for complex multi-step operations.

# Simulate A/B test data: 3 tests x 2 variants x 50 users each.
# Variants are laid out in blocks of 50 so that each conversion
# probability below applies to exactly one (test, variant) pair:
# test 1: A = 0.05, B = 0.07; test 2: A = 0.06, B = 0.08;
# test 3: A = 0.04, B = 0.09.
set.seed(42)  # make the simulated conversions reproducible
ab_tests <- tibble(
  test_id = rep(1:3, each = 100),
  variant = rep(rep(c("A", "B"), each = 50), times = 3),
  conversion = rbinom(
    300, 1,
    rep(c(0.05, 0.07, 0.06, 0.08, 0.04, 0.09), each = 50)
  )
)

# Analyze each test: compare the conversion rates of the two variants
# with a two-sample proportion test and return a one-row summary
test_results <- ab_tests %>%
  nest(.by = test_id) %>%
  mutate(
    analysis = map(data, function(d) {
      a_data <- filter(d, variant == "A")
      b_data <- filter(d, variant == "B")

      # With conversion counts this small, prop.test() may warn that the
      # chi-squared approximation is inaccurate
      test <- prop.test(
        x = c(sum(a_data$conversion), sum(b_data$conversion)),
        n = c(nrow(a_data), nrow(b_data))
      )

      tibble(
        variant_a_rate = mean(a_data$conversion),
        variant_b_rate = mean(b_data$conversion),
        p_value = test$p.value,
        significant = test$p.value < 0.05
      )
    })
  )

# One row per test with the summary columns unpacked
test_results %>%
  select(test_id, analysis) %>%
  unnest(analysis)

Managing Nested Data Frames

Use helper functions to inspect and manipulate nested structures.

# Describe each nested tibble: its dimensions and column names
nested_sales %>%
  mutate(
    n_rows = map_int(data, function(d) nrow(d)),
    n_cols = map_int(data, function(d) ncol(d)),
    col_names = map(data, function(d) names(d))
  )

# Keep only the groups whose average revenue exceeds 120
filter(
  nested_sales,
  map_lgl(data, function(d) mean(d$revenue) > 120)
)

# Add a profit column inside every nested tibble, then flatten
nested_sales %>%
  mutate(
    data = map(data, function(d) mutate(d, profit = revenue - costs))
  ) %>%
  unnest(data)

# Build a one-row summary per group while the data stays nested, then
# unpack only the summaries
nested_sales %>%
  mutate(
    summary = map(data, function(d) {
      summarise(
        d,
        avg_revenue = mean(revenue),
        total_costs = sum(costs),
        quarters = n()
      )
    })
  ) %>%
  select(region, summary) %>%
  unnest(summary)

Performance Considerations

Nesting creates memory overhead but enables cleaner code for group-wise operations.

# Placeholder per-group computation used by both versions below; swap in
# your own function. Here: total profit (revenue - costs) for one group.
process_function <- function(d) {
  sum(d$revenue - d$costs)
}

# Instead of a hand-written split-apply-combine loop
regions <- unique(sales$region)
results <- lapply(regions, function(r) {
  subset_data <- sales[sales$region == r, ]
  process_function(subset_data)
})

# Use nesting for clarity: the result stays next to its group key
sales %>%
  nest(.by = region) %>%
  mutate(result = map(data, process_function))

For large datasets, consider whether nesting is necessary or if group_by() with summarise() suffices. Nesting shines when you need to store complex objects (models, plots, nested tibbles) or perform operations that don’t reduce to simple summaries.