R dplyr - filter() Rows by Condition

Key Insights

filter() selects rows based on logical conditions, supporting multiple criteria with AND/OR logic and working seamlessly with pipe operators for readable data transformations
String matching requires grepl() or stringr functions since filter() evaluates logical expressions, not pattern matching directly
Performance optimization comes from filtering early in pipelines, using %in% for multiple values, and combining conditions efficiently rather than chaining multiple filter() calls

Basic Filtering Syntax

The filter() function from dplyr selects rows where conditions evaluate to TRUE. Unlike base R subsetting with brackets, filter() automatically removes NA values and integrates cleanly into piped workflows.

library(dplyr)

# Sample dataset
employees <- data.frame(
  id = 1:8,
  name = c("Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", "Henry"),
  department = c("Sales", "IT", "Sales", "IT", "HR", "IT", "Sales", "HR"),
  salary = c(65000, 72000, 58000, 81000, 69000, 75000, 62000, 71000),
  years_experience = c(3, 5, 2, 8, 4, 6, 3, 5)
)

# Single condition
high_earners <- employees %>%
  filter(salary > 70000)

print(high_earners)
#   id   name department salary years_experience
# 1  2    Bob         IT  72000                5
# 2  4  David         IT  81000                8
# 3  6  Frank         IT  75000                6
# 4  8  Henry         HR  71000                5

Multiple Conditions with AND Logic

Combine conditions with commas or the & operator. Each condition must evaluate to TRUE for a row to be included.

# IT department with high salaries (comma-separated)
it_high_earners <- employees %>%
  filter(department == "IT", salary > 70000)

# Equivalent using & operator
it_high_earners_alt <- employees %>%
  filter(department == "IT" & salary > 70000)

print(it_high_earners)
#   id   name department salary years_experience
# 1  2    Bob         IT  72000                5
# 2  4  David         IT  81000                8
# 3  6  Frank         IT  75000                6

# Range filtering
mid_career <- employees %>%
  filter(years_experience >= 4, years_experience <= 6)

print(mid_career)
#   id   name department salary years_experience
# 1  2    Bob         IT  72000                5
# 2  5    Eve         HR  69000                4
# 3  6  Frank         IT  75000                6
# 4  8  Henry         HR  71000                5

OR Conditions and Complex Logic

Use the | operator for OR logic. Parentheses control evaluation order when mixing AND/OR conditions.

# Sales OR high salary
sales_or_high <- employees %>%
  filter(department == "Sales" | salary > 70000)

print(sales_or_high)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  2    Bob         IT  72000                5
# 3  3  Carol      Sales  58000                2
# 4  4  David         IT  81000                8
# 5  6  Frank         IT  75000                6
# 6  7  Grace      Sales  62000                3
# 7  8  Henry         HR  71000                5

# Complex: (Sales with experience > 2) OR (IT with salary > 80000)
complex_filter <- employees %>%
  filter((department == "Sales" & years_experience > 2) | 
         (department == "IT" & salary > 80000))

print(complex_filter)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  4  David         IT  81000                8
# 3  7  Grace      Sales  62000                3

Filtering with %in% for Multiple Values

The %in% operator efficiently checks membership in a vector, replacing multiple OR conditions.

# Multiple departments - inefficient
dept_filter_bad <- employees %>%
  filter(department == "IT" | department == "HR")

# Multiple departments - efficient
dept_filter_good <- employees %>%
  filter(department %in% c("IT", "HR"))

print(dept_filter_good)
#   id   name department salary years_experience
# 1  2    Bob         IT  72000                5
# 2  4  David         IT  81000                8
# 3  5    Eve         HR  69000                4
# 4  6  Frank         IT  75000                6
# 5  8  Henry         HR  71000                5

# Exclude values with negation
not_sales <- employees %>%
  filter(!department %in% c("Sales"))

# Filter by ID list
target_ids <- c(2, 4, 6, 8)
selected_employees <- employees %>%
  filter(id %in% target_ids)

String Pattern Matching

Filter text columns using grepl() for pattern matching or stringr functions for advanced operations.

library(stringr)

# Names containing 'a' (case-insensitive)
names_with_a <- employees %>%
  filter(grepl("a", name, ignore.case = TRUE))

print(names_with_a)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  3  Carol      Sales  58000                2
# 3  4  David         IT  81000                8
# 4  6  Frank         IT  75000                6
# 5  7  Grace      Sales  62000                3

# Using stringr for more complex patterns
starts_with_c_or_g <- employees %>%
  filter(str_detect(name, "^[CG]"))

print(starts_with_c_or_g)
#   id   name department salary years_experience
# 1  3  Carol      Sales  58000                2
# 2  7  Grace      Sales  62000                3

# Multiple pattern matching
sales_or_hr_dept <- employees %>%
  filter(str_detect(department, "Sales|HR"))

Handling Missing Values

Filter() automatically excludes NA values. Use is.na() or !is.na() for explicit NA handling.

# Dataset with missing values
employees_na <- employees
employees_na$salary[c(2, 5)] <- NA

# Automatic NA removal
has_salary <- employees_na %>%
  filter(salary > 65000)

print(has_salary)
#   id   name department salary years_experience
# 1  4  David         IT  81000                8
# 2  6  Frank         IT  75000                6
# 3  8  Henry         HR  71000                5

# Explicit NA filtering
only_na_salary <- employees_na %>%
  filter(is.na(salary))

print(only_na_salary)
#   id name department salary years_experience
# 1  2  Bob         IT     NA                5
# 2  5  Eve         HR     NA                4

# Keep rows with complete data
complete_records <- employees_na %>%
  filter(!is.na(salary), !is.na(department))

Using filter() with Helper Functions

Combine filter() with dplyr helper functions like between(), near(), and if_any()/if_all() for advanced filtering.

# between() for range checks
mid_salary <- employees %>%
  filter(between(salary, 60000, 75000))

print(mid_salary)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  2    Bob         IT  72000                5
# 3  5    Eve         HR  69000                4
# 4  6  Frank         IT  75000                6
# 5  7  Grace      Sales  62000                3
# 6  8  Henry         HR  71000                5

# near() for floating-point comparisons
target_salary <- 72000
near_target <- employees %>%
  filter(near(salary, target_salary, tol = 5000))

# if_any() - match any condition across columns (dplyr 1.0.4+)
library(tidyselect)

high_values <- employees %>%
  filter(if_any(c(salary, years_experience), ~ . > 7))

print(high_values)
#   id   name department salary years_experience
# 1  2    Bob         IT  72000                5
# 2  4  David         IT  81000                8
# 3  5    Eve         HR  69000                4
# 4  6  Frank         IT  75000                6
# 5  8  Henry         HR  71000                5

Performance Considerations

Position filter() early in pipelines to reduce data volume for subsequent operations. Combine conditions in a single filter() call rather than chaining.

# Inefficient - multiple filter calls
result_slow <- employees %>%
  filter(department == "IT") %>%
  filter(salary > 70000) %>%
  filter(years_experience >= 5)

# Efficient - single filter call
result_fast <- employees %>%
  filter(department == "IT", salary > 70000, years_experience >= 5)

# Pre-compute complex conditions for readability
employees %>%
  mutate(is_senior = years_experience >= 5 & salary > 70000) %>%
  filter(is_senior, department == "IT")

# Use %in% instead of multiple OR conditions
# Bad: filter(id == 1 | id == 2 | id == 3 | id == 4)
# Good: filter(id %in% 1:4)

Filtering Grouped Data

Filter() respects grouping from group_by(), enabling per-group filtering operations.

# Filter to above-median salary within each department
above_dept_median <- employees %>%
  group_by(department) %>%
  filter(salary > median(salary)) %>%
  ungroup()

print(above_dept_median)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  2    Bob         IT  72000                5
# 3  4  David         IT  81000                8
# 4  6  Frank         IT  75000                6
# 5  7  Grace      Sales  62000                3
# 6  8  Henry         HR  71000                5

# Keep only departments with more than 2 employees
large_departments <- employees %>%
  group_by(department) %>%
  filter(n() > 2) %>%
  ungroup()

print(large_departments)
#   id   name department salary years_experience
# 1  1  Alice      Sales  65000                3
# 2  2    Bob         IT  72000                5
# 3  3  Carol      Sales  58000                2
# 4  4  David         IT  81000                8
# 5  6  Frank         IT  75000                6
# 6  7  Grace      Sales  62000                3