R purrr - keep() and discard()


Key Insights

• keep() and discard() filter lists and vectors using predicate functions, providing a more expressive alternative to bracket subsetting when working with complex filtering logic
• These functions shine when filtering nested data structures, model outputs, or API responses where traditional subsetting becomes unwieldy
• Unlike Filter() from base R, purrr's approach integrates seamlessly with pipes and supports anonymous function shortcuts for cleaner code

Understanding keep() and discard()

The keep() and discard() functions are complementary tools for filtering elements in lists and vectors. keep() retains elements where a predicate function returns TRUE, while discard() removes them. Think of them as semantic filters that make your intent explicit.

library(purrr)

numbers <- list(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

# Keep even numbers
keep(numbers, ~ .x %% 2 == 0)
# A list of five elements: 2, 4, 6, 8, 10

# Discard even numbers (keep odd)
discard(numbers, ~ .x %% 2 == 0)
# A list of five elements: 1, 3, 5, 7, 9

The formula syntax (the leading ~) creates an anonymous function on the fly, with .x standing for each element being tested. Writing ~ .x %% 2 == 0 is more concise than function(x) x %% 2 == 0.
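The same predicate can be written several ways; pick whichever reads best (the \(x) lambda requires R 4.1 or later):

# Three equivalent predicates
keep(numbers, ~ .x %% 2 == 0)           # purrr formula shorthand
keep(numbers, \(x) x %% 2 == 0)         # base R lambda (R >= 4.1)
keep(numbers, function(x) x %% 2 == 0)  # classic anonymous function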

Filtering by Type

A common use case involves filtering heterogeneous lists by data type. This is particularly useful when parsing JSON or working with mixed-type data structures.

mixed_list <- list(
  a = 1,
  b = "text",
  c = TRUE,
  d = 3.14,
  e = "another string",
  f = NULL,
  g = 42
)

# Keep only numeric values
numeric_only <- keep(mixed_list, is.numeric)
str(numeric_only)
# List of 3
#  $ a: num 1
#  $ d: num 3.14
#  $ g: num 42

# Discard character strings
no_strings <- discard(mixed_list, is.character)
str(no_strings)
# List of 5
#  $ a: num 1
#  $ c: logi TRUE
#  $ d: num 3.14
#  $ f: NULL
#  $ g: num 42

# Remove NULL values
clean_list <- discard(mixed_list, is.null)
length(clean_list)  # 6 instead of 7
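As an aside, discarding empty elements is common enough that purrr ships compact() for it. With this list the result matches the discard() call above, since f is the only zero-length element:

# compact() drops all zero-length elements, NULL included
identical(compact(mixed_list), discard(mixed_list, is.null))  # TRUE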

Working with Data Frames in Lists

When you have multiple data frames in a list—common after splitting data or running multiple models—keep() and discard() help filter based on properties.

library(dplyr)  # loaded for the %>% pipe used in later examples

# Create sample data frames
df_list <- list(
  small = data.frame(x = 1:3, y = letters[1:3]),
  medium = data.frame(x = 1:50, y = rep(letters, length.out = 50)),
  large = data.frame(x = 1:200, y = rep(letters, length.out = 200)),
  tiny = data.frame(x = 1:2, y = c("a", "b"))
)

# Keep data frames with more than 10 rows
large_dfs <- keep(df_list, ~ nrow(.x) > 10)
names(large_dfs)
# [1] "medium" "large"

# Discard data frames with specific columns
has_z_column <- function(df) "z" %in% names(df)
no_z_column <- discard(df_list, has_z_column)

# Keep data frames where mean of x exceeds threshold
keep(df_list, ~ mean(.x$x) > 25)
# $medium
# ... 50 rows ...
# $large
# ... 200 rows ...
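If you are on purrr 1.0.0 or later, there are also name-based companions, keep_at() and discard_at(), which filter by element name rather than by predicate:

# Name-based filtering (purrr >= 1.0.0)
keep_at(df_list, c("small", "tiny"))  # keep by name
discard_at(df_list, "large")          # drop by name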

Filtering Model Results

Statistical modeling often produces lists of model objects. Filter them based on fit statistics or convergence.

# Fit several linear models of increasing complexity
models <- list(
  model1 = lm(mpg ~ wt, data = mtcars),
  model2 = lm(mpg ~ wt + hp, data = mtcars),
  model3 = lm(mpg ~ wt + hp + disp, data = mtcars),
  model4 = lm(mpg ~ wt + hp + disp + drat, data = mtcars)
)

# Keep models with R-squared above 0.8
good_models <- keep(models, ~ summary(.x)$r.squared > 0.8)
length(good_models)

# Get R-squared for verification
map_dbl(good_models, ~ summary(.x)$r.squared)

# Discard models with more than 3 predictors
simple_models <- discard(models, ~ length(coef(.x)) > 4)
map_int(simple_models, ~ length(coef(.x)))
# model1 model2 model3 
#      2      3      4

Complex Predicate Functions

For sophisticated filtering logic, define named predicate functions. This improves readability and reusability.

# Working with a list of API responses
api_responses <- list(
  response1 = list(status = 200, data = list(value = 100), timestamp = Sys.time()),
  response2 = list(status = 404, data = NULL, timestamp = Sys.time()),
  response3 = list(status = 200, data = list(value = 250), timestamp = Sys.time()),
  response4 = list(status = 500, data = NULL, timestamp = Sys.time()),
  response5 = list(status = 200, data = list(value = 75), timestamp = Sys.time())
)

# Define predicate for successful responses
is_successful <- function(response) {
  response$status == 200 && !is.null(response$data)
}

successful_responses <- keep(api_responses, is_successful)
length(successful_responses)  # 3

# Chain multiple filters
high_value_responses <- successful_responses %>%
  keep(~ .x$data$value > 80)

length(high_value_responses)  # 2

Combining with Other purrr Functions

The real power emerges when combining keep() and discard() with map(), reduce(), and other purrr functions.

# Process nested data structure
sales_data <- list(
  q1 = list(revenue = 100000, expenses = 80000, region = "North"),
  q2 = list(revenue = 150000, expenses = 90000, region = "North"),
  q3 = list(revenue = 120000, expenses = 95000, region = "South"),
  q4 = list(revenue = 180000, expenses = 100000, region = "North")
)

# Keep profitable quarters, calculate profit, sum total
total_profit <- sales_data %>%
  keep(~ .x$revenue > .x$expenses) %>%
  map_dbl(~ .x$revenue - .x$expenses) %>%
  sum()

total_profit  # 185000

# Filter and transform in pipeline
north_region_summary <- sales_data %>%
  keep(~ .x$region == "North") %>%
  map_dfr(~ data.frame(
    revenue = .x$revenue,
    expenses = .x$expenses,
    profit = .x$revenue - .x$expenses
  ))

print(north_region_summary)

Performance Considerations

While keep() and discard() are elegant, understand their performance characteristics for large datasets.

# For simple vectors, bracket subsetting is faster
large_vector <- 1:1000000

# Bracket subsetting
system.time({
  result1 <- large_vector[large_vector %% 2 == 0]
})

# purrr::keep()
system.time({
  result2 <- keep(large_vector, ~ .x %% 2 == 0)
})

# Use keep() for readability with lists/complex predicates
# Use bracket subsetting for performance-critical vector operations
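For reference, the base R equivalent mentioned earlier is Filter(), which takes the function first and offers no formula shorthand:

# Base R counterpart to keep()
Filter(function(x) x %% 2 == 0, as.list(1:10))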

Practical Example: Data Validation Pipeline

Here’s a complete example validating and cleaning a list of user records.

user_records <- list(
  list(id = 1, name = "Alice", email = "alice@example.com", age = 30),
  list(id = 2, name = "Bob", email = "invalid-email", age = 25),
  list(id = 3, name = "", email = "charlie@example.com", age = 35),
  list(id = 4, name = "Diana", email = "diana@example.com", age = -5),
  list(id = 5, name = "Eve", email = "eve@example.com", age = 28)
)

# Validation predicates
has_valid_email <- function(record) {
  grepl("^[^@]+@[^@]+\\.[^@]+$", record$email)
}

has_valid_name <- function(record) {
  nchar(record$name) > 0
}

has_valid_age <- function(record) {
  record$age > 0 && record$age < 120
}

# Clean pipeline
valid_records <- user_records %>%
  keep(has_valid_email) %>%
  keep(has_valid_name) %>%
  keep(has_valid_age)

length(valid_records)  # 2
map_chr(valid_records, "name")  # "Alice" "Eve"

# Alternative: find invalid records for logging
invalid_records <- user_records %>%
  discard(has_valid_email) %>%
  map_dbl("id")

invalid_records  # 2 (the record whose email failed validation)
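If you prefer a single keep() over discard(), purrr's negate() builds the complementary predicate:

# negate() inverts a predicate function
is_invalid_email <- negate(has_valid_email)
keep(user_records, is_invalid_email) %>% map_dbl("id")
# [1] 2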

The keep() and discard() functions transform list filtering from mechanical subsetting into declarative data manipulation. Use them when predicate logic is complex, when working with nested structures, or when code readability trumps marginal performance gains.
