R - Add/Remove Rows in Data Frame
The most straightforward approach uses `rbind()` to bind rows together. Create a new row as a data frame or list with matching column names:
Key Insights
- R provides multiple methods to add rows, including rbind(), direct assignment with indexing, and dplyr::add_row(), each with distinct performance characteristics for different data sizes
- Row removal requires understanding R’s negative indexing, logical subsetting, and the subset() function to avoid common pitfalls like unintended data loss
- Working with data frames demands attention to row names, factor levels, and column matching to prevent silent errors that corrupt your dataset
Adding Single Rows to Data Frames
The most straightforward approach uses rbind() to bind rows together. Create a new row as a data frame or list with matching column names:
# Start from a three-row data frame
df <- data.frame(id = seq_len(3),
                 name = c("Alice", "Bob", "Carol"),
                 score = c(85, 92, 78))
# Append one row supplied as a one-row data frame
new_row <- data.frame(id = 4, name = "David", score = 88)
df <- rbind(df, new_row)
# Append one row supplied as a named list
df <- rbind(df, list(id = 5, name = "Eve", score = 95))
print(df)
#   id  name score
# 1  1 Alice    85
# 2  2   Bob    92
# 3  3 Carol    78
# 4  4 David    88
# 5  5   Eve    95
Direct indexing assignment works but requires careful handling of data types:
# Add row by direct assignment
# Pitfall: c() can hold only one type, so c(6, "Frank", 81) is a character
# vector, and assigning it converts id and score to character columns.
# Shown on a copy so that df keeps its numeric columns.
df_coerced <- df
df_coerced[nrow(df_coerced) + 1, ] <- c(6, "Frank", 81)
# Better approach: a list keeps each value's own type
df[nrow(df) + 1, ] <- list(id = 6, name = "Frank", score = 81)
df[nrow(df) + 1, ] <- list(id = 7, name = "Grace", score = 90)
The dplyr package provides add_row() with cleaner syntax and better type handling:
library(dplyr)
# Append a row; add_row() takes the new values as name-value pairs
df <- add_row(df, id = 8, name = "Henry", score = 87)
# Add at specific position (before the current row 2)
df <- add_row(df, id = 9, name = "Iris", score = 93, .before = 2)
Adding Multiple Rows Efficiently
When adding multiple rows, create a separate data frame and use rbind() once rather than repeatedly:
# Inefficient: the data frame is copied and regrown on every iteration
df_base <- data.frame(id = 1:3, value = c(10, 20, 30))
for (i in seq(4L, 1000L)) {
  df_base <- rbind(df_base, data.frame(id = i, value = i * 10))
}
# Efficient: build all new rows at once, then bind a single time
df_base <- data.frame(id = 1:3, value = c(10, 20, 30))
new_rows <- data.frame(id = 4:1000)
new_rows$value <- new_rows$id * 10
df_combined <- rbind(df_base, new_rows)
Use dplyr::bind_rows() for more flexible row binding that handles missing columns:
library(dplyr)
# Two data frames that share id and name but differ in their third column
df1 <- data.frame(id = 1:2, name = c("Alice", "Bob"), age = c(25, 30))
df2 <- data.frame(id = 3:4, name = c("Carol", "David"), salary = c(50000, 60000))
# Columns missing from one input are filled with NA in the result
df_merged <- bind_rows(list(df1, df2))
#   id  name age salary
# 1  1 Alice  25     NA
# 2  2   Bob  30     NA
# 3  3 Carol  NA  50000
# 4  4 David  NA  60000
Removing Rows by Index
Negative indexing removes rows by position:
df <- data.frame(
  id = 1:5,
  name = c("Alice", "Bob", "Carol", "David", "Eve"),
  score = c(85, 92, 78, 88, 95)
)
# Each example starts from the full five-row df and stores its result
# separately. Reassigning df after every step would shift the positions,
# so a later "-c(1, 4)" would no longer hit the rows named in the comment.
# Remove single row
df_drop_one <- df[-3, ] # Removes row 3 (Carol)
# Remove multiple rows
df_drop_some <- df[-c(1, 4), ] # Removes rows 1 and 4 (Alice, David)
# Remove range of rows
df_drop_range <- df[-(2:4), ] # Removes rows 2 through 4 (Bob, Carol, David)
Keep specific rows using positive indexing:
# Rebuild the five-row sample so the positions used below are sure to exist;
# indexing a position past the last row returns a row filled with NA
df <- data.frame(
  id = 1:5,
  name = c("Alice", "Bob", "Carol", "David", "Eve"),
  score = c(85, 92, 78, 88, 95)
)
# Keep only rows 1, 3, and 5
df <- df[c(1, 3, 5), ]
# Keep first 10 rows
# head() stops at the last row, whereas df[1:10, ] would pad a shorter
# data frame with all-NA rows
df <- head(df, 10)
Removing Rows by Condition
Logical subsetting removes rows based on column values:
df <- data.frame(
  id = 1:8,
  name = c("Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", "Henry"),
  score = c(85, 92, 78, 88, 95, 72, 90, 84),
  status = c("pass", "pass", "fail", "pass", "pass", "fail", "pass", "pass")
)
# Drop rows scoring below 80 by keeping the rest
df <- df[df[["score"]] >= 80, ]
# Drop rows whose status is "fail"
df <- df[df[["status"]] != "fail", ]
# Both conditions must hold (AND)
df <- df[with(df, score >= 85 & status == "pass"), ]
# Either condition may hold (OR)
df <- df[with(df, score >= 90 | id <= 2), ]
The subset() function provides cleaner syntax without explicit indexing:
# Filter with subset(); column names are used directly in the condition
df <- subset(df, subset = score >= 80)
df <- subset(df, subset = status != "fail")
df <- subset(df, subset = score >= 85 & status == "pass")
# Filter rows and keep only the name and score columns in one call
df_filtered <- subset(df, subset = score >= 85, select = c(name, score))
Using dplyr::filter() for complex conditions:
library(dplyr)
# Conditions separated by commas must all hold
df <- filter(df, score >= 80, status == "pass")
# Drop rows whose score is NA
df <- filter(df, !is.na(score))
# Combine a numeric test with a pattern match on name
df <- filter(df, score >= 85 | grepl("^A", name))
Handling Row Names and Duplicates
Row names can cause issues when adding or removing rows:
df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  score = c(85, 92, 78),
  row.names = c("student1", "student2", "student3")
)
# A new row that reuses an existing row name
new_row <- data.frame(name = "David", score = 88, row.names = "student1")
# rbind() does not stop on the clash: it silently makes the row names
# unique, so the new row ends up labelled "student11"
df_renamed <- rbind(df, new_row)
rownames(df_renamed)
# [1] "student1"  "student2"  "student3"  "student11"
# Reset row names before adding to avoid the surprise label
rownames(new_row) <- NULL
df <- rbind(df, new_row)
# Or use automatic row numbering
df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  score = c(85, 92, 78)
)
# Assigning NULL restores the default 1..n row names
rownames(df) <- NULL
Remove duplicate rows using duplicated():
df <- data.frame(id = c(1, 2, 3, 2, 4),
                 name = c("Alice", "Bob", "Carol", "Bob", "David"),
                 score = c(85, 92, 78, 92, 88))
# Drop rows that repeat an earlier row in every column
df <- df[!duplicated(df), , drop = FALSE]
# Drop rows that repeat an earlier id/name combination
df <- df[!duplicated(df[, c("id", "name")]), , drop = FALSE]
# Same de-duplication with dplyr
library(dplyr)
# Unique rows across all columns
df <- distinct(df)
# Unique id/name combinations, keeping the remaining columns
df <- distinct(df, id, name, .keep_all = TRUE)
Removing Rows with Missing Values
Handle NA values strategically to maintain data integrity:
df <- data.frame(id = 1:6,
                 name = c("Alice", "Bob", NA, "David", "Eve", "Frank"),
                 score = c(85, NA, 78, 88, 95, NA))
# Keep only rows with no NA in any column
df_complete <- na.omit(df)
# Keep only rows whose score is present
df <- df[!is.na(df[["score"]]), ]
# Same NA filter on score with dplyr
library(dplyr)
df <- filter(df, !is.na(score))
# Drop rows in which every column is NA
df <- df[rowSums(is.na(df)) < ncol(df), ]
# Drop rows with NA in either name or score
df <- df[complete.cases(df[, c("name", "score")]), ]
These techniques form the foundation for data frame manipulation in R. Choose methods based on your specific use case: rbind() for simple additions, dplyr for readable pipelines, and logical subsetting for conditional removal. Always verify column types and row names to avoid silent data corruption.