R - Add/Remove Rows in Data Frame | Application Architect

Key Insights

R provides multiple methods to add rows including rbind(), direct assignment with indexing, and dplyr::add_row(), each with distinct performance characteristics for different data sizes
Row removal requires understanding R’s negative indexing, logical subsetting, and the subset() function to avoid common pitfalls like unintended data loss
Working with data frames demands attention to row names, factor levels, and column matching to prevent silent errors that corrupt your dataset

Adding Single Rows to Data Frames

The most straightforward approach uses rbind() to bind rows together. Create a new row as a data frame or list with matching column names:

# Starting point: three students, one row each
df <- data.frame(
  id    = seq_len(3),
  name  = c("Alice", "Bob", "Carol"),
  score = c(85, 92, 78)
)

# Append one row supplied as a single-row data frame;
# columns are matched to df by name
new_row <- data.frame(id = 4, name = "David", score = 88)
df <- rbind(df, new_row)

# Append one row supplied as a named list instead
df <- rbind(df, list(id = 5, name = "Eve", score = 95))

# Show the five-row result
print(df)
#   id  name score
# 1  1 Alice    85
# 2  2   Bob    92
# 3  3 Carol    78
# 4  4 David    88
# 5  5   Eve    95

Direct indexing assignment also works, but the new values must be supplied as a list rather than with c(), so that each column keeps its own data type:

# Pitfall: c() builds an atomic vector, so mixing numbers and text turns
# every value into a character string. Assigning such a vector as a row
# silently converts the numeric columns id and score to character:
#   df[nrow(df) + 1, ] <- c(6, "Frank", 81)   # don't do this

# Add rows by direct assignment with a list instead. A list keeps each
# value's own type, so the existing column types are preserved.
df[nrow(df) + 1, ] <- list(id = 6, name = "Frank", score = 81)
df[nrow(df) + 1, ] <- list(id = 7, name = "Grace", score = 90)

The dplyr package provides add_row() with cleaner syntax and better type handling:

library(dplyr)

# Append Henry at the end, then insert Iris at a specific position:
# .before = 2 places the new row ahead of the current second row
df <- df %>%
  add_row(id = 8, name = "Henry", score = 87) %>%
  add_row(id = 9, name = "Iris", score = 93, .before = 2)

Adding Multiple Rows Efficiently

When adding multiple rows, create a separate data frame and use rbind() once rather than repeatedly:

# Slow pattern: growing the data frame with one rbind() call per new row
df_base <- data.frame(id = 1:3, value = c(10, 20, 30))
for (next_id in seq(4, 1000)) {
  df_base <- rbind(df_base, data.frame(id = next_id, value = next_id * 10))
}

# Fast pattern: build all new rows in one data frame, then bind a single time
df_base <- data.frame(id = 1:3, value = c(10, 20, 30))
new_rows <- data.frame(id = 4:1000)
new_rows$value <- new_rows$id * 10
df_combined <- rbind(df_base, new_rows)

Use dplyr::bind_rows() for more flexible row binding that handles missing columns:

library(dplyr)

# Two frames that share id and name, each with one column the other lacks
df1 <- data.frame(
  id = 1:2,
  name = c("Alice", "Bob"),
  age = c(25, 30)
)
df2 <- data.frame(
  id = 3:4,
  name = c("Carol", "David"),
  salary = c(50000, 60000)
)

# Columns are matched by name; cells without a source value become NA
df_merged <- df1 %>% bind_rows(df2)
#   id  name age salary
# 1  1 Alice  25     NA
# 2  2   Bob  30     NA
# 3  3 Carol  NA  50000
# 4  4 David  NA  60000

Removing Rows by Index

Negative indexing removes rows by position:

df <- data.frame(
  id = 1:5,
  name = c("Alice", "Bob", "Carol", "David", "Eve"),
  score = c(85, 92, 78, 88, 95)
)

# Negative indices refer to positions in the data frame as it is at the
# moment of the call. Each example below therefore starts from the original
# five rows and stores its result under a separate name; reassigning df
# every time would shift the positions used by the following examples.

# Remove single row
df_without_carol <- df[-3, ]  # Removes row 3 (Carol)

# Remove multiple rows
df_without_1_and_4 <- df[-c(1, 4), ]  # Removes rows 1 and 4 (Alice, David)

# Remove range of rows
# The parentheses matter: -(2:4) is c(-2, -3, -4), while -2:4 would mix
# negative and positive indices and fail
df_without_2_to_4 <- df[-(2:4), ]  # Removes rows 2 through 4

Keep specific rows using positive indexing:

# Keep only rows 1, 3, and 5
# (a position larger than nrow(df) would come back as an all-NA row)
df <- df[c(1, 3, 5), ]

# Keep first 10 rows
# head() returns at most 10 rows and returns the whole data frame when it
# is shorter, whereas df[1:10, ] would pad the result with all-NA rows for
# positions that do not exist
df <- head(df, 10)

Removing Rows by Condition

Logical subsetting removes rows based on column values:

df <- data.frame(
  id = 1:8,
  name = c(
    "Alice", "Bob", "Carol", "David",
    "Eve", "Frank", "Grace", "Henry"
  ),
  score = c(85, 92, 78, 88, 95, 72, 90, 84),
  status = c("pass", "pass", "fail", "pass", "pass", "fail", "pass", "pass")
)

# Each step filters the result of the previous one

# Drop low scores: keep every row whose score is not below 80
df <- df[!(df[["score"]] < 80), ]

# Drop failures: keep every row whose status is not "fail"
df <- df[!(df[["status"]] == "fail"), ]

# Combine conditions with AND (&): both must hold for a row to stay
df <- df[df[["status"]] == "pass" & df[["score"]] >= 85, ]

# Combine conditions with OR (|): either one is enough for a row to stay
df <- df[df[["id"]] <= 2 | df[["score"]] >= 90, ]

The subset() function provides cleaner syntax without explicit indexing:

# The same filters written with subset(): column names are used unquoted
# and the condition is passed as the subset argument
df <- subset(df, subset = score >= 80)
df <- subset(df, subset = !(status == "fail"))
df <- subset(df, subset = status == "pass" & score >= 85)

# Filter rows and keep only the name and score columns in a single call
df_filtered <- subset(df, subset = score >= 85, select = c(name, score))

Use dplyr::filter() for more complex conditions:

library(dplyr)

# Keep rows that satisfy both conditions
df <- df %>%
  filter(score >= 80 & status == "pass")

# Drop rows whose score is missing
df <- filter(df, !is.na(score))

# Mix a numeric test with a pattern match: keep names starting with "A"
# as well as any row scoring at least 85
df <- filter(df, grepl("^A", name) | score >= 85)

Handling Row Names and Duplicates

Row names can cause issues when adding or removing rows:

df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  score = c(85, 92, 78),
  row.names = c("student1", "student2", "student3")
)

# A new row may carry a row name that already exists in df
new_row <- data.frame(name = "David", score = 88, row.names = "student1")

# rbind() does not raise an error for the clash: it silently makes the
# duplicated name unique, so the new row ends up labelled "student11"
df_renamed <- rbind(df, new_row)
rownames(df_renamed)
# [1] "student1"  "student2"  "student3"  "student11"

# An error only occurs when duplicate row names are assigned directly:
#   rownames(df) <- c("student1", "student1", "student3")
#   Error: duplicate 'row.names' are not allowed

# Reset row names before adding, so no misleading label is generated
rownames(new_row) <- NULL
df <- rbind(df, new_row)

# Or use automatic row numbering
df <- data.frame(
  name = c("Alice", "Bob", "Carol"),
  score = c(85, 92, 78)
)
# Assigning NULL restores the default 1..n numbering; it changes nothing
# on a freshly built data frame but is useful after rows have been removed
rownames(df) <- NULL

Remove duplicate rows using duplicated():

df <- data.frame(
  id    = c(1, 2, 3, 2, 4),
  name  = c("Alice", "Bob", "Carol", "Bob", "David"),
  score = c(85, 92, 78, 92, 88)
)

# Drop rows that repeat an earlier row in every column;
# scanning from the top means the first occurrence is the one kept
df <- df[!duplicated(df, fromLast = FALSE), ]

# Drop rows that repeat an earlier row in the id and name columns only
df <- df[!duplicated(df[, c("id", "name")]), ]

# dplyr equivalents: distinct() on all columns, then on id and name only
# (.keep_all = TRUE retains the remaining columns of each kept row)
library(dplyr)
df <- distinct(df)
df <- distinct(df, id, name, .keep_all = TRUE)

Removing Rows with Missing Values

Handle NA values strategically to maintain data integrity:

df <- data.frame(
  id    = 1:6,
  name  = c("Alice", "Bob", NA, "David", "Eve", "Frank"),
  score = c(85, NA, 78, 88, 95, NA)
)

# Keep only the rows that have no missing value in any column
df_complete <- na.omit(df)

# Keep only the rows whose score is present
df <- df[!is.na(df[["score"]]), ]

# dplyr equivalent: keep rows whose score is not missing
library(dplyr)
df <- filter(df, !is.na(score))

# Drop rows that are entirely NA: keep rows with at least one present value
df <- df[rowSums(!is.na(df)) > 0, ]

# Drop rows where name or score is missing
df <- df[complete.cases(df[, c("name", "score")]), ]

These techniques form the foundation for data frame manipulation in R. Choose methods based on your specific use case: rbind() for simple additions, dplyr for readable pipelines, and logical subsetting for conditional removal. Always verify column types and row names to avoid silent data corruption.