R - reshape() - Wide to Long and Back

• The `reshape()` function transforms data between wide format (multiple columns per subject) and long format (one row per observation) without external packages

Key Insights

• The reshape() function transforms data between wide format (multiple columns per subject) and long format (one row per observation) without external packages • Direction is controlled by the direction parameter: “wide” creates multiple columns per ID, “long” stacks observations into rows • Proper specification of idvar, timevar, and varying parameters is critical for successful reshaping operations

Understanding Wide vs Long Format

Wide format stores repeated measurements as separate columns. Each subject occupies one row, with multiple columns representing different time points or conditions. Long format stacks all measurements vertically, with one row per observation and additional columns identifying the subject and measurement type.

# Wide format example
wide_data <- data.frame(
  id = c(1, 2, 3),
  name = c("Alice", "Bob", "Carol"),
  score_t1 = c(85, 92, 78),
  score_t2 = c(88, 89, 82),
  score_t3 = c(90, 95, 85)
)

print(wide_data)
#   id  name score_t1 score_t2 score_t3
# 1  1 Alice       85       88       90
# 2  2   Bob       92       89       95
# 3  3 Carol       78       82       85

Converting Wide to Long Format

The reshape() function requires three essential parameters for wide-to-long conversion: varying (columns to stack), v.names (name for the stacked values), and timevar (name for the time/condition identifier).

# Basic wide to long conversion
long_data <- reshape(
  wide_data,
  direction = "long",
  varying = c("score_t1", "score_t2", "score_t3"),
  v.names = "score",
  timevar = "time",
  times = c("t1", "t2", "t3"),
  idvar = "id"
)

# Sort for readability
long_data <- long_data[order(long_data$id, long_data$time), ]

print(long_data)
#     id  name time score
# 1.1  1 Alice   t1    85
# 1.2  1 Alice   t2    88
# 1.3  1 Alice   t3    90
# 2.1  2   Bob   t1    92
# 2.2  2   Bob   t2    89
# 2.3  2   Bob   t3    95
# 3.1  3 Carol   t1    78
# 3.2  3 Carol   t2    82
# 3.3  3 Carol   t3    85

The times parameter explicitly labels each time point. Without it, reshape() uses numeric indices (1, 2, 3).

Converting Long to Wide Format

Reversing the transformation requires specifying the timevar (which column contains time identifiers) and idvar (which column uniquely identifies subjects). The v.names parameter indicates which column contains the values to spread.

# Long to wide conversion
wide_again <- reshape(
  long_data,
  direction = "wide",
  v.names = "score",
  timevar = "time",
  idvar = c("id", "name")
)

print(wide_again)
#   id  name score.t1 score.t2 score.t3
# 1  1 Alice       85       88       90
# 2  2   Bob       92       89       95
# 3  3 Carol       78       82       85

Note that column names in the wide format use dot notation (score.t1) rather than underscore. Control this with the sep parameter.

# Custom separator
wide_custom <- reshape(
  long_data,
  direction = "wide",
  v.names = "score",
  timevar = "time",
  idvar = c("id", "name"),
  sep = "_"
)

print(names(wide_custom))
# [1] "id"       "name"     "score_t1" "score_t2" "score_t3"

Reshaping Multiple Variables Simultaneously

Real datasets often contain multiple measurements per time point. The varying parameter accepts a list structure to handle multiple variables.

# Wide data with multiple measurements
multi_wide <- data.frame(
  id = c(1, 2, 3),
  score_t1 = c(85, 92, 78),
  score_t2 = c(88, 89, 82),
  hours_t1 = c(5, 6, 4),
  hours_t2 = c(6, 5, 5)
)

# Reshape with multiple variables
multi_long <- reshape(
  multi_wide,
  direction = "long",
  varying = list(
    score = c("score_t1", "score_t2"),
    hours = c("hours_t1", "hours_t2")
  ),
  v.names = c("score", "hours"),
  timevar = "time",
  times = c("t1", "t2"),
  idvar = "id"
)

multi_long <- multi_long[order(multi_long$id, multi_long$time), ]

print(multi_long)
#     id time score hours
# 1.1  1   t1    85     5
# 1.2  1   t2    88     6
# 2.1  2   t1    92     6
# 2.2  2   t2    89     5
# 3.1  3   t1    78     4
# 3.2  3   t2    82     5

Automatic Column Detection

When column names follow consistent patterns, reshape() can automatically detect varying columns. This reduces code verbosity for datasets with many time points.

# Data with consistent naming pattern
auto_wide <- data.frame(
  id = 1:2,
  treatment = c("A", "B"),
  measure.1 = c(10, 15),
  measure.2 = c(12, 17),
  measure.3 = c(14, 19)
)

# Automatic detection using varying parameter
auto_long <- reshape(
  auto_wide,
  direction = "long",
  varying = 3:5,  # Column positions
  v.names = "measure",
  timevar = "period",
  idvar = "id"
)

print(auto_long)
#     id treatment period measure
# 1.1  1         A      1      10
# 2.1  2         B      1      15
# 1.2  1         A      2      12
# 2.2  2         B      2      17
# 1.3  1         A      3      14
# 2.3  2         B      3      19

Handling Missing Values

reshape() preserves missing values during transformation. The drop parameter controls whether to remove rows with all missing values in the varying columns.

# Data with missing values
missing_wide <- data.frame(
  id = c(1, 2, 3),
  val_t1 = c(10, NA, 15),
  val_t2 = c(12, 20, NA),
  val_t3 = c(NA, 22, 18)
)

# Default behavior keeps all rows
missing_long <- reshape(
  missing_wide,
  direction = "long",
  varying = c("val_t1", "val_t2", "val_t3"),
  v.names = "value",
  timevar = "time",
  idvar = "id"
)

print(missing_long)
#     id time value
# 1.1  1    1    10
# 2.1  2    1    NA
# 3.1  3    1    15
# 1.2  1    2    12
# 2.2  2    2    20
# 3.2  3    2    NA
# 1.3  1    3    NA
# 2.3  2    3    22
# 3.3  3    3    18

# Remove rows with missing values
missing_long_clean <- na.omit(missing_long)
print(nrow(missing_long_clean))
# [1] 6

Complex ID Variables

When multiple columns are required to uniquely identify observations, include all identifying columns in the idvar parameter. This commonly occurs with hierarchical or grouped data.

# Data with composite keys
complex_wide <- data.frame(
  site = c("A", "A", "B", "B"),
  subject = c(1, 2, 1, 2),
  gender = c("M", "F", "M", "F"),
  bp_baseline = c(120, 115, 130, 125),
  bp_followup = c(118, 113, 128, 122)
)

complex_long <- reshape(
  complex_wide,
  direction = "long",
  varying = c("bp_baseline", "bp_followup"),
  v.names = "blood_pressure",
  timevar = "visit",
  times = c("baseline", "followup"),
  idvar = c("site", "subject")
)

print(complex_long)
#       site subject gender visit blood_pressure
# A.1.1    A       1      M baseline            120
# A.2.1    A       2      F baseline            115
# B.1.1    B       1      M baseline            130
# B.2.1    B       2      F baseline            125
# A.1.2    A       1      M followup            118
# A.2.2    A       2      F followup            113
# B.1.2    B       1      M followup            128
# B.2.2    B       2      F followup            122

Row Name Management

The reshape() function creates row names combining ID and time values. Clean these with row.names() for production code or when exporting data.

# Reset row names
long_clean <- long_data
row.names(long_clean) <- NULL

print(head(long_clean, 3))
#   id  name time score
# 1  1 Alice   t1    85
# 2  1 Alice   t2    88
# 3  1 Alice   t3    90

The base R reshape() function provides robust data transformation capabilities without dependencies. While packages like tidyr offer more intuitive syntax, reshape() remains valuable for production environments requiring minimal package dependencies and for understanding fundamental data manipulation concepts.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.