R - Read from Database (DBI/RSQLite)
The DBI (Database Interface) package provides a standardized way to interact with databases in R. RSQLite implements this interface for SQLite databases, offering a zero-configuration option that stores data in a single file.
Key Insights
- The DBI package provides a unified interface for database operations in R, with RSQLite offering a lightweight, serverless database solution perfect for local data storage and prototyping
- Parameterized queries using dbBind() or dbGetQuery() with parameters prevent SQL injection attacks and enable efficient query reuse for batch operations
- Connection pooling and proper resource management with dbDisconnect() are critical for production applications to prevent connection leaks and optimize database performance
Setting Up Database Connections
The DBI (Database Interface) package provides a standardized way to interact with databases in R. RSQLite implements this interface for SQLite databases, offering a zero-configuration option that stores data in a single file.
# Load the generic database interface and the SQLite backend that
# implements it.
library(DBI)
library(RSQLite)
# Create an in-memory database: it exists only for this R session and
# vanishes when the connection closes -- handy for examples and tests.
con <- dbConnect(RSQLite::SQLite(), ":memory:")
# Or connect to a file-based database (the file is created if absent).
con_file <- dbConnect(RSQLite::SQLite(), "mydata.db")
# Check connection status: TRUE while the handle is open and usable.
dbIsValid(con)
For production environments, you’ll typically connect to remote databases using specific drivers:
# PostgreSQL -- same DBI verbs as SQLite, only the driver changes.
library(RPostgres)
con_pg <- dbConnect(
RPostgres::Postgres(),
host = "localhost",
port = 5432,
dbname = "production_db",
user = "app_user",
# Pull the secret from the environment rather than hard-coding it.
password = Sys.getenv("DB_PASSWORD")
)
# MySQL / MariaDB via the RMariaDB driver.
library(RMariaDB)
con_mysql <- dbConnect(
RMariaDB::MariaDB(),
host = "localhost",
dbname = "analytics",
user = "readonly_user",
# Sys.getenv() returns "" when the variable is unset, so a missing
# secret fails the connection attempt instead of using a default.
password = Sys.getenv("MYSQL_PASSWORD")
)
Creating Sample Data
Before reading data, let’s create a sample dataset to work with:
# Create a customers table. NOTE(review): SQLite has no native DATE
# storage class, so signup_date will hold whatever representation the
# client writes (R Date vectors arrive as numbers) -- confirm before
# comparing it against text literals in SQL.
dbExecute(con, "
CREATE TABLE customers (
customer_id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
email TEXT UNIQUE,
signup_date DATE,
total_purchases REAL
)
")
# Insert sample data: five customers with ids 1..5.
customers_data <- data.frame(
customer_id = 1:5,
name = c("Alice Johnson", "Bob Smith", "Carol White", "David Brown", "Eve Davis"),
email = c("alice@example.com", "bob@example.com", "carol@example.com",
"david@example.com", "eve@example.com"),
signup_date = as.Date(c("2023-01-15", "2023-02-20", "2023-03-10",
"2023-04-05", "2023-05-12")),
total_purchases = c(1250.50, 890.25, 2100.00, 450.75, 3200.00)
)
# append = TRUE adds rows to the table created above instead of
# recreating it, so the declared constraints stay in force.
dbWriteTable(con, "customers", customers_data, append = TRUE)
# Create an orders table with a foreign key back to customers.
# NOTE(review): SQLite only enforces foreign keys when
# PRAGMA foreign_keys = ON has been issued on the connection -- the
# constraint here is declarative otherwise.
dbExecute(con, "
CREATE TABLE orders (
order_id INTEGER PRIMARY KEY,
customer_id INTEGER,
order_date DATE,
amount REAL,
status TEXT,
FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
)
")
# Ten orders across five customers, dated on ten consecutive days.
orders_data <- data.frame(
order_id = 1:10,
customer_id = c(1, 1, 2, 3, 3, 3, 4, 5, 5, 5),
order_date = as.Date("2023-06-01") + 0:9,
amount = c(150.00, 200.50, 890.25, 500.00, 600.00, 1000.00,
450.75, 1000.00, 1200.00, 1000.00),
status = c("completed", "completed", "completed", "completed",
"pending", "completed", "completed", "completed",
"completed", "pending")
)
# Append into the empty table created above.
dbWriteTable(con, "orders", orders_data, append = TRUE)
Basic Read Operations
The dbGetQuery() function executes a SQL query and returns results as a data frame:
# Read all records: dbGetQuery() runs the statement, fetches every
# row into a data frame, and releases the result set in one step.
all_customers <- dbGetQuery(con, "SELECT * FROM customers")
print(all_customers)
# Read with filtering; WHERE/ORDER BY run inside SQLite, so only the
# matching rows cross into R.
high_value <- dbGetQuery(con, "
SELECT name, email, total_purchases
FROM customers
WHERE total_purchases > 1000
ORDER BY total_purchases DESC
")
print(high_value)
# Aggregate queries return a one-row data frame of summary columns.
summary_stats <- dbGetQuery(con, "
SELECT
COUNT(*) as customer_count,
AVG(total_purchases) as avg_purchases,
MAX(total_purchases) as max_purchases,
MIN(total_purchases) as min_purchases
FROM customers
")
print(summary_stats)
For queries that don’t return data, use dbExecute():
# Update records. RSQLite stores R Date columns as plain numbers
# (days since 1970-01-01), and SQLite's cross-type ordering sorts
# every number before every text value -- so the literal comparison
# signup_date < '2023-03-01' silently matched ALL rows, not just the
# pre-March signups. Binding an R Date parameter converts the value
# the same way the stored column was converted, so the comparison is
# numeric vs. numeric and selects the intended rows.
rows_affected <- dbExecute(con, "
UPDATE customers
SET total_purchases = total_purchases * 1.1
WHERE signup_date < ?
", params = list(as.Date("2023-03-01")))
print(paste("Updated", rows_affected, "rows"))
Parameterized Queries
Never concatenate user input directly into SQL queries. Use parameterized queries to prevent SQL injection:
# Unsafe - DON'T DO THIS: pasting user input into SQL lets a crafted
# value (e.g. "' OR '1'='1") rewrite the query. The string is built
# here for illustration only and is never executed.
user_input <- "alice@example.com"
unsafe_query <- paste0("SELECT * FROM customers WHERE email = '", user_input, "'")
# Safe - use parameters: the driver transmits the value separately
# from the SQL text, so it can never be parsed as SQL.
safe_query <- dbGetQuery(con,
"SELECT * FROM customers WHERE email = ?",
params = list("alice@example.com")
)
# Multiple parameters bind positionally to the ? placeholders, in
# the order they appear in params.
date_range_query <- dbGetQuery(con,
"SELECT * FROM orders WHERE order_date BETWEEN ? AND ? AND status = ?",
params = list(as.Date("2023-06-01"), as.Date("2023-06-05"), "completed")
)
print(date_range_query)
For reusable prepared statements with multiple executions:
# Prepare the statement once; it can then be re-executed with fresh
# bindings without re-sending the SQL.
stmt <- dbSendQuery(con, "SELECT * FROM customers WHERE customer_id = ?")
# Execute with different parameters: each dbBind() runs the query
# with the new values, and dbFetch() retrieves that execution's rows.
dbBind(stmt, list(1))
result1 <- dbFetch(stmt)
print(result1)
dbBind(stmt, list(3))
result2 <- dbFetch(stmt)
print(result2)
# Clean up: a pending result blocks further queries on this
# connection until it is cleared.
dbClearResult(stmt)
Joins and Complex Queries
Combine data from multiple tables using joins:
# Customer order history: INNER JOIN keeps only customers that have
# at least one matching (completed) order.
customer_orders <- dbGetQuery(con, "
SELECT
c.name,
c.email,
o.order_id,
o.order_date,
o.amount,
o.status
FROM customers c
INNER JOIN orders o ON c.customer_id = o.customer_id
WHERE o.status = 'completed'
ORDER BY o.order_date DESC
")
print(customer_orders)
# Customer summary with aggregates: LEFT JOIN would keep customers
# with no orders (order_count 0), but the HAVING clause then filters
# to customers with more than one order.
# NOTE(review): referencing the alias order_count in HAVING is a
# SQLite extension; portable SQL repeats COUNT(o.order_id) there.
customer_summary <- dbGetQuery(con, "
SELECT
c.customer_id,
c.name,
c.total_purchases,
COUNT(o.order_id) as order_count,
SUM(o.amount) as order_total,
AVG(o.amount) as avg_order_value,
MAX(o.order_date) as last_order_date
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.name, c.total_purchases
HAVING order_count > 1
ORDER BY order_total DESC
")
print(customer_summary)
Chunked Reading for Large Datasets
For large result sets that don’t fit in memory, fetch data in chunks:
# Send the query without fetching any rows yet; dbSendQuery() leaves
# an open result set we can drain incrementally.
res <- dbSendQuery(con, "SELECT * FROM orders")
# Pull the result set down a few rows at a time so a large table
# never has to fit in memory all at once.
batch_rows <- 3
batches <- list()
i <- 1
repeat {
  piece <- dbFetch(res, n = batch_rows)
  # An empty fetch means the result set is exhausted.
  if (nrow(piece) == 0) break
  print(paste("Processing chunk", i, "with", nrow(piece), "rows"))
  batches[[i]] <- piece
  i <- i + 1
}
# Stitch the batches back together into a single data frame.
full_dataset <- do.call(rbind, batches)
# Always release the open result set when done.
dbClearResult(res)
Metadata and Schema Inspection
Inspect database structure programmatically:
# List all tables visible on this connection.
tables <- dbListTables(con)
print(tables)
# List fields in a table (column names only, no types).
fields <- dbListFields(con, "customers")
print(fields)
# Get column information -- PRAGMA table_info is SQLite-specific;
# other backends expose schema through their own system catalogs.
column_info <- dbGetQuery(con, "PRAGMA table_info(customers)")
print(column_info)
# Check if table exists before reading or writing it.
table_exists <- dbExistsTable(con, "customers")
print(table_exists)
# Get row count efficiently: COUNT(*) runs inside the database
# instead of pulling every row into R first.
row_count <- dbGetQuery(con, "SELECT COUNT(*) as count FROM orders")$count
print(paste("Orders table has", row_count, "rows"))
Transaction Management
Use transactions for data consistency:
# Begin transaction: the statements below commit or roll back as a unit.
dbBegin(con)
tryCatch({
# Multiple operations (customer 999 does not exist, so both affect
# zero rows -- the point here is the commit/rollback pattern).
dbExecute(con, "UPDATE customers SET total_purchases = 0 WHERE customer_id = 999")
dbExecute(con, "DELETE FROM orders WHERE customer_id = 999")
# Commit only if every statement succeeded.
dbCommit(con)
print("Transaction committed successfully")
}, error = function(e) {
# Rollback on error so the database is left exactly as it was.
dbRollback(con)
print(paste("Transaction rolled back:", e$message))
})
Connection Cleanup
Always close connections when finished:
# Close specific connections; pair every dbConnect() with a
# dbDisconnect() (inside functions, register the disconnect with
# on.exit(dbDisconnect(con), add = TRUE) so it also runs on error).
dbDisconnect(con)
dbDisconnect(con_file)
# NOTE: dbListConnections() is deprecated in DBI and no longer
# supported by RSQLite, so there is no reliable way to enumerate
# open connections -- keep references to the connections you open
# and close each one explicitly, as above.
# Verify closure: dbIsValid() returns FALSE for a disconnected
# handle rather than throwing, so test its value directly instead of
# wrapping it in tryCatch (the error handler would never fire).
if (!dbIsValid(con)) {
  print("Connection successfully closed")
}
For production applications, implement connection pooling using the pool package:
library(pool)
# Create a connection pool: it maintains between minSize and maxSize
# SQLite connections to mydata.db and hands them out on demand.
pool <- dbPool(
drv = RSQLite::SQLite(),
dbname = "mydata.db",
minSize = 1,
maxSize = 5
)
# Use pooled connections: the pool object is queried like a plain
# connection; a connection is checked out and returned automatically.
result <- dbGetQuery(pool, "SELECT * FROM customers")
# Close every connection in the pool at application shutdown.
poolClose(pool)
This approach to database reading in R provides the foundation for building robust data pipelines, analytical applications, and reporting systems that scale from prototype to production.