R tidyr - pivot_longer() (Wide to Long)

Key Insights

• pivot_longer() transforms wide-format data into long format by converting column names into values of a new variable, essential for tidy data analysis and visualization in R • The function requires at minimum the cols parameter to specify which columns to pivot, with names_to and values_to controlling the output structure • Advanced features like names_pattern, names_sep, and values_drop_na handle complex column naming schemes and data cleaning during transformation

Understanding Wide vs Long Format

Wide format stores observations across multiple columns, where each variable occupies its own column. Long format stacks these observations vertically, creating fewer columns but more rows. Most tidyverse functions and ggplot2 visualizations expect long format data.

library(tidyr)
library(dplyr)

# Wide format - each year is a separate column
wide_data <- data.frame(
  country = c("USA", "Canada", "Mexico"),
  pop_2020 = c(331, 38, 128),
  pop_2021 = c(332, 38, 130),
  pop_2022 = c(333, 39, 132)
)

print(wide_data)
#   country pop_2020 pop_2021 pop_2022
# 1     USA      331      332      333
# 2  Canada       38       38       39
# 3  Mexico      128      130      132

Basic pivot_longer() Syntax

The fundamental transformation requires specifying which columns to pivot. The cols parameter accepts column names, ranges, or selection helpers.

# Convert to long format
long_data <- wide_data %>%
  pivot_longer(
    cols = pop_2020:pop_2022,
    names_to = "year",
    values_to = "population"
  )

print(long_data)
#   country year     population
# 1 USA     pop_2020        331
# 2 USA     pop_2021        332
# 3 USA     pop_2022        333
# 4 Canada  pop_2020         38
# 5 Canada  pop_2021         38
# 6 Canada  pop_2022         39
# 7 Mexico  pop_2020        128
# 8 Mexico  pop_2021        130
# 9 Mexico  pop_2022        132

Column Selection Methods

pivot_longer() supports multiple selection approaches using tidyselect syntax.

# Select by name
pivot_longer(wide_data, cols = c(pop_2020, pop_2021, pop_2022))

# Select by range
pivot_longer(wide_data, cols = pop_2020:pop_2022)

# Select by position
pivot_longer(wide_data, cols = 2:4)

# Select by pattern
pivot_longer(wide_data, cols = starts_with("pop_"))

# Select everything except certain columns
pivot_longer(wide_data, cols = -country)

# Select by type
sales_data <- data.frame(
  product = c("A", "B"),
  q1 = c(100, 150),
  q2 = c(120, 160),
  category = c("Electronics", "Furniture")
)

sales_data %>%
  pivot_longer(cols = where(is.numeric))

Cleaning Column Names During Pivot

The names_prefix parameter removes common prefixes, while names_transform applies functions to column names.

long_data_clean <- wide_data %>%
  pivot_longer(
    cols = starts_with("pop_"),
    names_to = "year",
    names_prefix = "pop_",
    values_to = "population",
    names_transform = list(year = as.integer)
  )

print(long_data_clean)
#   country year population
# 1 USA     2020        331
# 2 USA     2021        332
# 3 USA     2022        333
# 4 Canada  2020         38
# 5 Canada  2021         38
# 6 Canada  2022         39
# 7 Mexico  2020        128
# 8 Mexico  2021        130
# 9 Mexico  2022        132

Splitting Column Names with names_sep

When column names contain multiple pieces of information separated by delimiters, names_sep splits them into separate columns.

# Data with compound column names
compound_data <- data.frame(
  country = c("USA", "Canada"),
  gdp_2020_usd = c(21000, 1600),
  gdp_2021_usd = c(23000, 1700),
  gdp_2020_eur = c(18000, 1400),
  gdp_2021_eur = c(19000, 1500)
)

result <- compound_data %>%
  pivot_longer(
    cols = -country,
    names_to = c("metric", "year", "currency"),
    names_sep = "_",
    values_to = "value"
  )

print(result)
#   country metric year currency value
# 1 USA     gdp    2020 usd      21000
# 2 USA     gdp    2021 usd      23000
# 3 USA     gdp    2020 eur      18000
# 4 USA     gdp    2021 eur      19000
# 5 Canada  gdp    2020 usd       1600
# 6 Canada  gdp    2021 usd       1700
# 7 Canada  gdp    2020 eur       1400
# 8 Canada  gdp    2021 eur       1500

Pattern Matching with names_pattern

For complex naming schemes, names_pattern uses regular expressions to extract components.

# Column names with irregular patterns
irregular_data <- data.frame(
  id = 1:2,
  temp_day1_celsius = c(20, 22),
  temp_day2_celsius = c(21, 23),
  temp_day1_fahrenheit = c(68, 72),
  temp_day2_fahrenheit = c(70, 73)
)

pattern_result <- irregular_data %>%
  pivot_longer(
    cols = -id,
    names_pattern = "temp_day(.*)_(.*)",
    names_to = c("day", "unit"),
    values_to = "temperature"
  )

print(pattern_result)
#   id day unit        temperature
# 1  1 1   celsius              20
# 2  1 2   celsius              21
# 3  1 1   fahrenheit           68
# 4  1 2   fahrenheit           70
# 5  2 1   celsius              22
# 6  2 2   celsius              23
# 7  2 1   fahrenheit           72
# 8  2 2   fahrenheit           73

Multiple Value Columns

The special .value placeholder in names_to creates multiple value columns from the pivoted data.

# Data with multiple metrics per time period
multi_metric <- data.frame(
  store = c("A", "B"),
  sales_q1 = c(1000, 1500),
  profit_q1 = c(200, 300),
  sales_q2 = c(1100, 1600),
  profit_q2 = c(220, 320)
)

multi_result <- multi_metric %>%
  pivot_longer(
    cols = -store,
    names_to = c(".value", "quarter"),
    names_sep = "_"
  )

print(multi_result)
#   store quarter sales profit
# 1 A     q1       1000    200
# 2 A     q2       1100    220
# 3 B     q1       1500    300
# 4 B     q2       1600    320

Handling Missing Values

Control NA handling with values_drop_na to remove rows with missing values during the pivot operation.

sparse_data <- data.frame(
  product = c("Widget", "Gadget"),
  jan = c(100, NA),
  feb = c(NA, 200),
  mar = c(150, 250)
)

# Keep NAs
with_na <- sparse_data %>%
  pivot_longer(
    cols = -product,
    names_to = "month",
    values_to = "sales"
  )

# Drop NAs
without_na <- sparse_data %>%
  pivot_longer(
    cols = -product,
    names_to = "month",
    values_to = "sales",
    values_drop_na = TRUE
  )

print(with_na)
#   product month sales
# 1 Widget  jan     100
# 2 Widget  feb      NA
# 3 Widget  mar     150
# 4 Gadget  jan      NA
# 5 Gadget  feb     200
# 6 Gadget  mar     250

print(without_na)
#   product month sales
# 1 Widget  jan     100
# 2 Widget  mar     150
# 3 Gadget  feb     200
# 4 Gadget  mar     250

Real-World Example: Survey Data

Survey data commonly arrives in wide format with one column per question. Pivoting enables analysis across questions.

survey_wide <- data.frame(
  respondent_id = 1:3,
  age = c(25, 34, 42),
  satisfaction_product = c(4, 5, 3),
  satisfaction_service = c(5, 4, 4),
  satisfaction_price = c(3, 3, 2)
)

survey_long <- survey_wide %>%
  pivot_longer(
    cols = starts_with("satisfaction_"),
    names_to = "category",
    names_prefix = "satisfaction_",
    values_to = "rating"
  )

# Calculate average rating by category
survey_long %>%
  group_by(category) %>%
  summarize(avg_rating = mean(rating))
#   category avg_rating
# 1 price          2.67
# 2 product        4.00
# 3 service        4.33

Performance Considerations

For large datasets, specify column types explicitly and consider data.table alternatives for memory efficiency.

# Specify value types for better performance
large_data %>%
  pivot_longer(
    cols = -id,
    names_to = "variable",
    values_to = "value",
    values_transform = list(value = as.numeric)
  )

# For very large datasets, consider data.table::melt()
library(data.table)
dt <- as.data.table(wide_data)
melt(dt, id.vars = "country", variable.name = "year", value.name = "population")

The pivot_longer() function provides flexible data reshaping capabilities essential for tidy data workflows. Master the parameter combinations to handle any wide-to-long transformation efficiently.