R - Read from Database (DBI/RSQLite)

The DBI (Database Interface) package provides a standardized way to interact with databases in R. RSQLite implements this interface for SQLite databases, offering a zero-configuration option that stores all data in a single file.

Key Insights

  • The DBI package provides a unified interface for database operations in R, with RSQLite offering a lightweight, serverless database solution perfect for local data storage and prototyping
  • Parameterized queries using dbBind() or dbGetQuery() with parameters prevent SQL injection attacks and enable efficient query reuse for batch operations
  • Connection pooling and proper resource management with dbDisconnect() are critical for production applications to prevent connection leaks and optimize database performance

Setting Up Database Connections

The DBI (Database Interface) package provides a standardized way to interact with databases in R. RSQLite implements this interface for SQLite databases, offering a zero-configuration option that stores data in a single file.

# DBI supplies the generic database API; RSQLite supplies the SQLite backend
library(DBI)
library(RSQLite)

# Instantiate the driver once and reuse it for every connection
sqlite_drv <- RSQLite::SQLite()

# ":memory:" creates a throwaway database that lives only inside this session
con <- dbConnect(sqlite_drv, ":memory:")

# A file path instead of ":memory:" persists the data to a single file on disk
con_file <- dbConnect(sqlite_drv, "mydata.db")

# Returns TRUE while the connection handle is usable
dbIsValid(con)

For production environments, you’ll typically connect to remote databases using specific drivers:

# PostgreSQL driver
library(RPostgres)
con_pg <- dbConnect(
  RPostgres::Postgres(),
  dbname = "production_db",
  host = "localhost",
  port = 5432,
  user = "app_user",
  password = Sys.getenv("DB_PASSWORD")  # read secrets from the environment
)

# MySQL / MariaDB driver
library(RMariaDB)
con_mysql <- dbConnect(
  RMariaDB::MariaDB(),
  dbname = "analytics",
  host = "localhost",
  user = "readonly_user",
  password = Sys.getenv("MYSQL_PASSWORD")
)

Creating Sample Data

Before reading data, let’s create a sample dataset to work with:

# Define the customers table schema up front
dbExecute(con, "
  CREATE TABLE customers (
    customer_id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    email TEXT UNIQUE,
    signup_date DATE,
    total_purchases REAL
  )
")

# Build the sample rows column by column, then assemble the data frame
customer_names <- c("Alice Johnson", "Bob Smith", "Carol White",
                    "David Brown", "Eve Davis")
customer_emails <- c("alice@example.com", "bob@example.com", "carol@example.com",
                     "david@example.com", "eve@example.com")
signup_dates <- as.Date(c("2023-01-15", "2023-02-20", "2023-03-10",
                          "2023-04-05", "2023-05-12"))

customers_data <- data.frame(
  customer_id = 1:5,
  name = customer_names,
  email = customer_emails,
  signup_date = signup_dates,
  total_purchases = c(1250.50, 890.25, 2100.00, 450.75, 3200.00)
)

# Append rows into the table created above (append = TRUE keeps the schema)
dbWriteTable(con, "customers", customers_data, append = TRUE)

# Orders reference customers through a foreign key
dbExecute(con, "
  CREATE TABLE orders (
    order_id INTEGER PRIMARY KEY,
    customer_id INTEGER,
    order_date DATE,
    amount REAL,
    status TEXT,
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
  )
")

# Ten orders spread across the five customers, one per day from June 1st
order_status <- c("completed", "completed", "completed", "completed",
                  "pending", "completed", "completed", "completed",
                  "completed", "pending")

orders_data <- data.frame(
  order_id = 1:10,
  customer_id = c(1, 1, 2, 3, 3, 3, 4, 5, 5, 5),
  order_date = as.Date("2023-06-01") + 0:9,
  amount = c(150.00, 200.50, 890.25, 500.00, 600.00, 1000.00,
             450.75, 1000.00, 1200.00, 1000.00),
  status = order_status
)

# Rows are appended; the table itself was created by the CREATE TABLE above
dbWriteTable(con, "orders", orders_data, append = TRUE)

Basic Read Operations

The dbGetQuery() function executes a SQL query and returns results as a data frame:

# dbGetQuery() sends a SELECT and materializes the full result as a data frame
customers_df <- dbGetQuery(con, "SELECT * FROM customers")
print(customers_df)

# Filtering and sorting happen inside SQLite, not in R
big_spenders <- dbGetQuery(con, "
  SELECT name, email, total_purchases 
  FROM customers 
  WHERE total_purchases > 1000
  ORDER BY total_purchases DESC
")
print(big_spenders)

# Aggregations are pushed down to the database as well
purchase_stats <- dbGetQuery(con, "
  SELECT 
    COUNT(*) as customer_count,
    AVG(total_purchases) as avg_purchases,
    MAX(total_purchases) as max_purchases,
    MIN(total_purchases) as min_purchases
  FROM customers
")
print(purchase_stats)

For queries that don’t return data, use dbExecute():

# Update records.
# NOTE: RSQLite stores Date columns as plain numbers (days since 1970-01-01),
# so comparing signup_date against the text literal '2023-03-01' never does a
# date comparison -- SQLite sorts every number before every text value, making
# the original WHERE clause match ALL rows. Binding an R Date parameter
# serializes it exactly the way the stored values were written, so only the
# customers who signed up before 2023-03-01 are updated.
rows_affected <- dbExecute(con, "
  UPDATE customers 
  SET total_purchases = total_purchases * 1.1 
  WHERE signup_date < ?
", params = list(as.Date("2023-03-01")))
print(paste("Updated", rows_affected, "rows"))

Parameterized Queries

Never concatenate user input directly into SQL queries. Use parameterized queries to prevent SQL injection:

# Unsafe - DON'T DO THIS: string concatenation lets crafted input rewrite the SQL
user_input <- "alice@example.com"
unsafe_query <- paste0("SELECT * FROM customers WHERE email = '", user_input, "'")

# Safe - the driver binds the value, so it can never be parsed as SQL
matched_customer <- dbGetQuery(con, 
  "SELECT * FROM customers WHERE email = ?",
  params = list("alice@example.com")
)

# Placeholders are bound positionally when there is more than one
completed_in_range <- dbGetQuery(con,
  "SELECT * FROM orders WHERE order_date BETWEEN ? AND ? AND status = ?",
  params = list(as.Date("2023-06-01"), as.Date("2023-06-05"), "completed")
)
print(completed_in_range)

For reusable prepared statements with multiple executions:

# Compile the query once; each dbBind() re-executes it with new values
customer_stmt <- dbSendQuery(con, "SELECT * FROM customers WHERE customer_id = ?")

# Bind a value, then fetch the matching rows
dbBind(customer_stmt, list(1))
first_lookup <- dbFetch(customer_stmt)
print(first_lookup)

# Rebind and fetch again without re-parsing the SQL
dbBind(customer_stmt, list(3))
second_lookup <- dbFetch(customer_stmt)
print(second_lookup)

# Release the prepared statement when finished
dbClearResult(customer_stmt)

Joins and Complex Queries

Combine data from multiple tables using joins:

# Customer order history: INNER JOIN keeps only customers that have at least
# one matching order, and the WHERE clause further restricts to completed ones.
customer_orders <- dbGetQuery(con, "
  SELECT 
    c.name,
    c.email,
    o.order_id,
    o.order_date,
    o.amount,
    o.status
  FROM customers c
  INNER JOIN orders o ON c.customer_id = o.customer_id
  WHERE o.status = 'completed'
  ORDER BY o.order_date DESC
")
print(customer_orders)

# Customer summary with aggregates: LEFT JOIN keeps every customer even with
# zero orders; HAVING (which SQLite lets reference the order_count alias)
# then filters the grouped result to customers with more than one order.
customer_summary <- dbGetQuery(con, "
  SELECT 
    c.customer_id,
    c.name,
    c.total_purchases,
    COUNT(o.order_id) as order_count,
    SUM(o.amount) as order_total,
    AVG(o.amount) as avg_order_value,
    MAX(o.order_date) as last_order_date
  FROM customers c
  LEFT JOIN orders o ON c.customer_id = o.customer_id
  GROUP BY c.customer_id, c.name, c.total_purchases
  HAVING order_count > 1
  ORDER BY order_total DESC
")
print(customer_summary)

Chunked Reading for Large Datasets

For large result sets that don’t fit in memory, fetch data in chunks:

# Stream a large result set instead of loading it all at once.
# dbSendQuery() only registers the query; rows are pulled with dbFetch().
result <- dbSendQuery(con, "SELECT * FROM orders")

chunk_size <- 3   # tiny for demonstration; use thousands in real pipelines
all_chunks <- list()
chunk_num <- 1

# Wrap the fetch loop so the result set is ALWAYS released, even when a
# fetch or the per-chunk processing throws an error. The original code only
# cleared the result after the loop, leaking an open result set on failure,
# which blocks further queries on the same connection.
tryCatch({
  while (!dbHasCompleted(result)) {
    chunk <- dbFetch(result, n = chunk_size)
    if (nrow(chunk) > 0) {
      print(paste("Processing chunk", chunk_num, "with", nrow(chunk), "rows"))
      all_chunks[[chunk_num]] <- chunk
      chunk_num <- chunk_num + 1
    }
  }
}, finally = dbClearResult(result))

# Reassemble the processed chunks into one data frame
full_dataset <- do.call(rbind, all_chunks)

Metadata and Schema Inspection

Inspect database structure programmatically:

# Enumerate the tables known to this connection
table_names <- dbListTables(con)
print(table_names)

# Column names for a single table
customer_fields <- dbListFields(con, "customers")
print(customer_fields)

# SQLite's PRAGMA exposes richer column metadata (type, nullability, keys)
column_info <- dbGetQuery(con, "PRAGMA table_info(customers)")
print(column_info)

# Existence check before reading or writing
table_exists <- dbExistsTable(con, "customers")
print(table_exists)

# Let the database count rows rather than pulling them all into R
row_count <- dbGetQuery(con, "SELECT COUNT(*) as count FROM orders")$count
print(paste("Orders table has", row_count, "rows"))

Transaction Management

Use transactions for data consistency:

# Begin an explicit transaction: every statement from here until dbCommit()
# or dbRollback() is applied atomically.
dbBegin(con)

tryCatch({
  # Multiple operations that must succeed or fail together
  dbExecute(con, "UPDATE customers SET total_purchases = 0 WHERE customer_id = 999")
  dbExecute(con, "DELETE FROM orders WHERE customer_id = 999")
  
  # Commit if all succeed
  dbCommit(con)
  print("Transaction committed successfully")
}, error = function(e) {
  # Rollback on error so the database is left exactly as it was before dbBegin()
  dbRollback(con)
  print(paste("Transaction rolled back:", e$message))
})
# NOTE(review): DBI also provides dbWithTransaction(con, { ... }), which
# handles begin/commit/rollback automatically -- worth preferring when the
# manual pattern above is not needed for teaching purposes.

Connection Cleanup

Always close connections when finished:

# Close the connection and free its file handle / memory
dbDisconnect(con)

# dbListConnections() is deprecated in DBI (and defunct in recent RSQLite
# releases), so there is no reliable "close everything" call: disconnect each
# handle you opened explicitly instead. The file-based connection opened
# earlier was never closed -- a connection leak.
dbDisconnect(con_file)

# dbIsValid() does NOT raise an error on a closed connection -- it simply
# returns FALSE -- so wrapping it in tryCatch() means the error handler (and
# its confirmation message) can never run. Test the return value directly.
if (!dbIsValid(con)) {
  print("Connection successfully closed")
}

For production applications, implement connection pooling using the pool package:

library(pool)

# A pool hands out idle connections on demand and reclaims them afterwards,
# keeping between minSize and maxSize connections alive
conn_pool <- dbPool(
  drv = RSQLite::SQLite(),
  dbname = "mydata.db",
  minSize = 1,
  maxSize = 5
)

# Query through the pool exactly as you would through a single connection
result <- dbGetQuery(conn_pool, "SELECT * FROM customers")

# Shut the pool down (closing its connections) when the application exits
poolClose(conn_pool)

This approach to database reading in R provides the foundation for building robust data pipelines, analytical applications, and reporting systems that scale from prototype to production.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.