R - Data Frames - Complete Guide
Data frames store tabular data with columns of potentially different types. The `data.frame()` function constructs them from vectors, lists, or other data frames.
Key Insights
- Data frames are R’s fundamental two-dimensional structure for heterogeneous data, combining the flexibility of lists with the structure of matrices, making them essential for virtually all data analysis workflows
- Understanding data frame operations—from creation and indexing to manipulation and merging—eliminates the need for external packages in many scenarios and provides the foundation for mastering tidyverse tools
- Performance considerations matter: data frames have copy-on-modify semantics that can impact memory usage with large datasets, while proper indexing and vectorization dramatically improve execution speed
Creating Data Frames
Data frames store tabular data with columns of potentially different types. The data.frame() function constructs them from vectors, lists, or other data frames.
# Basic creation from vectors
# data.frame() pairs each named argument with a column; all vectors
# must have the same length (five elements here) or recycle evenly.
df <- data.frame(
id = 1:5,
name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
score = c(85.5, 92.3, 78.9, 95.1, 88.7),
passed = c(TRUE, TRUE, TRUE, TRUE, TRUE)
)
print(df)
# id name score passed
# 1 1 Alice 85.5 TRUE
# 2 2 Bob 92.3 TRUE
# 3 3 Charlie 78.9 TRUE
# 4 4 Diana 95.1 TRUE
# 5 5 Eve 88.7 TRUE
# Check structure
# str() lists each column with its storage type (int/chr/num/logi) and
# its first few values -- the quickest way to verify column types.
str(df)
# 'data.frame': 5 obs. of 4 variables:
# $ id : int 1 2 3 4 5
# $ name : chr "Alice" "Bob" "Charlie" "Diana" ...
# $ score : num 85.5 92.3 78.9 95.1 88.7
# $ passed: logi TRUE TRUE TRUE TRUE TRUE
Control string conversion behavior with stringsAsFactors. Prior to R 4.0.0, this defaulted to TRUE, converting character vectors to factors automatically; since R 4.0.0 the default is FALSE, so the explicit argument is only needed for code that must also run on older versions.
# Prevent automatic factor conversion
# (redundant on R >= 4.0.0, where FALSE is already the default, but
# explicit and required for code that must also run on older versions)
df_no_factors <- data.frame(
category = c("A", "B", "A", "C"),
value = 1:4,
stringsAsFactors = FALSE
)
# Explicit factor creation when needed
# Supplying levels fixes their order (and the full level set) instead
# of relying on the default alphabetical sort of observed values.
df_with_factors <- data.frame(
category = factor(c("A", "B", "A", "C"), levels = c("A", "B", "C")),
value = 1:4
)
Create data frames from matrices or convert between structures:
# From matrix: as.data.frame() turns every matrix column into a data
# frame column (all inheriting the matrix's single storage type);
# setNames() relabels the columns in the same step.
mat <- matrix(1:12, nrow = 4, ncol = 3)
df_from_mat <- setNames(as.data.frame(mat), c("x", "y", "z"))
# From list: a named list of equal-length vectors converts directly,
# with the list names becoming the column names.
list_data <- list(
  id = 1:3,
  value = c(10, 20, 30)
)
df_from_list <- as.data.frame(list_data)
Indexing and Subsetting
Data frames support multiple indexing methods: single bracket [], double bracket [[]], and dollar sign $ notation.
df <- data.frame(
id = 1:5,
name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
score = c(85.5, 92.3, 78.9, 95.1, 88.7)
)
# Single bracket returns data frame
# (except a single-column selection, which collapses to a vector
# unless drop = FALSE is supplied -- see below)
df[1, ] # First row
df[, 2] # Second column (as vector by default)
df[, 2, drop = FALSE] # Second column as data frame
df[1:3, ] # First three rows
df[c(1, 3), c("name", "score")] # Specific rows and columns
# Double bracket returns vector/element
df[[2]] # Second column as vector
df[[2]][1] # First element of second column
# Dollar sign notation (column access)
# NOTE: $ partially matches column names on data frames (df$sc would
# match score) -- prefer [[ ]] with the full name in programmatic code.
df$name # Returns vector
df$score[df$score > 90] # Conditional subsetting
Logical indexing enables powerful filtering:
# Filter rows by condition
# A logical vector with one element per row keeps the TRUE rows.
high_scores <- df[df$score > 90, ]
# Multiple conditions
# Use the elementwise & / | operators here, never the scalar && / ||.
df[df$score > 85 & df$score < 95, ]
# Using which() for row indices
# which() converts the logical mask into integer positions.
high_score_indices <- which(df$score > 90)
df[high_score_indices, ]
# Negative indexing to exclude
df[-c(1, 3), ] # Exclude rows 1 and 3
The subset() function provides cleaner syntax:
# subset() evaluates its condition inside the data frame, so columns
# can be referenced without the df$ prefix; select chooses columns.
subset(df, score > 90)
subset(df, score > 85 & score < 95, select = c(name, score))
# NOTE: subset() relies on non-standard evaluation; ?subset warns
# against using it inside functions -- prefer [ ] indexing there.
subset(df, score > 85, select = -id) # Exclude id column
Adding and Modifying Data
Add columns using assignment operators:
df <- data.frame(
id = 1:5,
score = c(85, 92, 78, 95, 88)
)
# Add single column
# Assigning to a new name with $ appends the column; the vector must
# match the row count (or recycle evenly).
df$grade <- c("B", "A", "C", "A", "B")
# Add calculated column
df$score_scaled <- df$score / 100
# Add multiple columns
# Bracket assignment with new names appends both columns at once.
df[c("bonus", "penalty")] <- data.frame(
bonus = c(5, 10, 0, 15, 5),
penalty = c(0, 0, 5, 0, 0)
)
# Conditional column creation
# ifelse() is vectorized: one "Excellent"/"Good" per row.
df$status <- ifelse(df$score >= 90, "Excellent", "Good")
Modify existing values:
# Modify single value
df[1, "score"] <- 90
# Modify entire column
df$score <- df$score + 5
# Conditional modification
# Caps any value that exceeded 100 after the +5 adjustment above.
df$score[df$score > 100] <- 100
# Using transform()
# NOTE: transform() evaluates every ... expression against the
# ORIGINAL df, so new_col uses the pre-scaling score, not score * 1.1.
# (Relies on the bonus column added in the previous example.)
df <- transform(df,
score = score * 1.1,
new_col = score + bonus
)
Add rows with rbind():
# rbind() requires the appended data frame to supply every column of
# df (columns are matched by name, so order may differ). At this point
# df also carries new_col, added by the transform() call above -- the
# original example omitted it, which makes rbind() fail with
# "numbers of columns of arguments do not match".
new_row <- data.frame(
  id = 6,
  score = 87,
  grade = "B",
  score_scaled = 0.87,
  bonus = 3,
  penalty = 0,
  status = "Good",
  new_col = 90  # score + bonus, matching the transform() definition
)
df <- rbind(df, new_row)
Merging and Joining
Combine data frames using merge operations similar to SQL joins:
# Sample data frames
# Diana (id 4) has no score rows and score id 5 has no roster entry;
# those two gaps are exactly what distinguishes the join types below.
students <- data.frame(
  student_id = 1:4,
  name = c("Alice", "Bob", "Charlie", "Diana")
)
scores <- data.frame(
  student_id = c(1, 2, 2, 3, 5),
  subject = c("Math", "Math", "Science", "Math", "Math"),
  score = c(85, 92, 88, 78, 95)
)
# Inner join (default): keeps only student_ids present in both tables
joined_inner <- merge(x = students, y = scores, by = "student_id")
# Left join: every roster row survives; unmatched score fields are NA
joined_left <- merge(x = students, y = scores, by = "student_id", all.x = TRUE)
# Right join: every score row survives, even the unmatched id 5
joined_right <- merge(x = students, y = scores, by = "student_id", all.y = TRUE)
# Full outer join: the union of both key sets
joined_full <- merge(x = students, y = scores, by = "student_id", all = TRUE)
# Different column names: by.x / by.y pair up differently named keys
courses <- data.frame(
  id = 1:4,
  course_name = c("Math", "Science", "History", "Art")
)
merge(x = students, y = courses, by.x = "student_id", by.y = "id")
Combine data frames horizontally with cbind():
# cbind() glues columns side by side; both frames must have the same
# number of rows (three here). do.call() applies cbind to the frames
# collected in a list -- equivalent to cbind(df1, df2).
df1 <- data.frame(a = 1:3, b = 4:6)
df2 <- data.frame(c = 7:9, d = 10:12)
combined <- do.call(cbind, list(df1, df2))
Reshaping Data
Convert between wide and long formats:
# Wide format data
# One row per id; each subject is its own column.
wide_df <- data.frame(
id = 1:3,
math = c(85, 90, 78),
science = c(88, 92, 82),
history = c(90, 88, 85)
)
# Wide to long (stack)
# stack() returns two columns (values, ind), concatenating the selected
# columns in order: all math rows, then science, then history.
long_df <- stack(wide_df[, c("math", "science", "history")])
# Each stacked column contributes ids 1:3 in order, hence times = 3.
long_df$id <- rep(1:3, times = 3)
colnames(long_df) <- c("score", "subject", "id")
# Using reshape()
# varying: the wide columns to collapse; v.names: name of the new value
# column; timevar/times: name and values of the new key column.
long_format <- reshape(
wide_df,
varying = c("math", "science", "history"),
v.names = "score",
timevar = "subject",
times = c("math", "science", "history"),
direction = "long"
)
# Long to wide
# idvar identifies rows; one output column is created per timevar
# level, named v.names.time (score.math, score.science, score.history).
wide_format <- reshape(
long_format,
idvar = "id",
timevar = "subject",
direction = "wide"
)
Sorting and Ordering
Sort data frames by one or multiple columns:
df <- data.frame(
name = c("Alice", "Bob", "Charlie", "Diana"),
age = c(25, 30, 25, 28),
score = c(85, 92, 88, 90)
)
# Sort by single column
# order() returns the row indices that put the column in sorted order;
# indexing df with them rearranges the rows.
df[order(df$score), ] # Ascending
df[order(-df$score), ] # Descending (negation works for numeric columns)
df[order(df$score, decreasing = TRUE), ] # Alternative
# Sort by multiple columns
# Later columns break ties in earlier ones.
df[order(df$age, -df$score), ] # Age ascending, score descending
# Using with()
# with() evaluates the expression inside df, dropping the df$ prefixes.
df[with(df, order(age, -score)), ]
Aggregation and Summary
Calculate summary statistics and perform group operations:
df <- data.frame(
category = c("A", "B", "A", "B", "A", "B"),
value = c(10, 15, 20, 25, 30, 35),
count = c(1, 2, 3, 4, 5, 6)
)
# Basic summaries
summary(df)
colMeans(df[, c("value", "count")])
colSums(df[, c("value", "count")])
# Aggregate by group
# Formula syntax: LHS is the measured variable, RHS the grouping
# variable(s); FUN is applied to each group's values.
aggregate(value ~ category, data = df, FUN = mean)
aggregate(value ~ category, data = df, FUN = sum)
# Multiple columns
# cbind() on the LHS aggregates several measures at once.
aggregate(cbind(value, count) ~ category, data = df, FUN = mean)
# Multiple grouping variables
df$region <- c("North", "North", "South", "South", "North", "South")
aggregate(value ~ category + region, data = df, FUN = sum)
# Using by()
# by() returns a list-like object: one summary() per category level.
by(df$value, df$category, summary)
# tapply for vector output
# tapply() returns a named vector -- convenient for further computation.
tapply(df$value, df$category, mean)
Performance Considerations
Pre-allocate data frames when building them iteratively:
# Inefficient: growing a data frame inside a loop forces R to copy the
# whole object on every rbind(), giving quadratic total cost.
df <- data.frame()
for (i in 1:10000) {
  df <- rbind(df, data.frame(x = i, y = i^2))
}
# Efficient: pre-allocate plain vectors of the final length, fill them,
# and build the data frame once at the end. (The original version
# pre-allocated integer(n) columns but assigned rows with
# df[i, ] <- c(i, i^2): row-wise data frame assignment is slow, and the
# double-valued c(i, i^2) coerces both columns to double, copying them
# and defeating the pre-allocation.)
n <- 10000
x <- integer(n)
y <- numeric(n)
for (i in seq_len(n)) {
  x[i] <- i
  y[i] <- i^2
}
df <- data.frame(x = x, y = y)
# Most efficient: fully vectorized, no loop at all.
df <- data.frame(
  x = 1:10000,
  y = (1:10000)^2
)
Use logical indexing efficiently:
# Create large data frame
# NOTE(review): rnorm() is random -- call set.seed() first if the data
# or timings below need to be reproducible across runs.
large_df <- data.frame(
id = 1:1000000,
value = rnorm(1000000)
)
# Efficient filtering
# A logical mask over all rows; system.time() reports elapsed seconds.
system.time(result <- large_df[large_df$value > 0, ])
# Using which() can be faster for small result sets
# (which() converts the mask to integer positions and drops NAs;
# irrelevant here since rnorm() produces no NAs)
system.time(result <- large_df[which(large_df$value > 0), ])
Data frames have copy-on-modify semantics: changing even a single column can duplicate the object in memory. For large datasets requiring frequent modifications, use the data.table package, which updates columns in place by reference (via `:=`) and avoids these copies.