R - Read JSON File (jsonlite)
Reading JSON files is a routine part of data workflows in R, and the `jsonlite` package is the standard tool for the job. This guide covers everything from basic parsing to streaming large files.
Key Insights
- The `jsonlite` package provides robust JSON parsing with automatic type conversion, handling nested structures more reliably than base R's `rjson` or `RJSONIO`
- Use `read_json()` for list outputs and `fromJSON()` for automatic data frame conversion, with `flatten = TRUE` to normalize nested structures
- Stream large JSON files with `stream_in()` to avoid memory issues, processing line-delimited JSON (ndjson) incrementally
Installing and Loading jsonlite
The jsonlite package is the de facto standard for JSON operations in R. Install it once and load it for each session:
install.packages("jsonlite")  # one-time installation
library(jsonlite)             # load once per session
For production environments, specify the version in your dependency management:
# renv.lock or DESCRIPTION file
jsonlite (>= 1.8.0)
Basic JSON File Reading
The fromJSON() function reads JSON files and automatically converts them to appropriate R data structures:
# Simple JSON object: fromJSON() converts it to a named list
data <- fromJSON("config.json")
# Accessing nested values: nested JSON objects become nested lists,
# so `$` chains walk down the hierarchy
api_key <- data$credentials$api_key
timeout <- data$settings$timeout
Example config.json:
{
"credentials": {
"api_key": "sk-abc123",
"region": "us-east-1"
},
"settings": {
"timeout": 30,
"retry": true
}
}
For arrays, fromJSON() creates data frames when elements have consistent structure:
# Reading a JSON array: records with consistent keys become
# rows of a data frame automatically
users <- fromJSON("users.json")
print(class(users)) # "data.frame"
# The `active` column is already logical, so it can index rows
# directly; comparing with `== TRUE` is redundant
active_users <- users[users$active, , drop = FALSE]
Example users.json:
[
{"id": 1, "name": "Alice", "active": true, "score": 95.5},
{"id": 2, "name": "Bob", "active": false, "score": 87.2},
{"id": 3, "name": "Charlie", "active": true, "score": 92.8}
]
Handling Nested JSON Structures
Real-world JSON often contains deeply nested objects. The flatten parameter normalizes these structures:
# Without flattening: nested objects stay as nested data frames /
# lists inside the result
nested_data <- fromJSON("transactions.json", flatten = FALSE)
# Results in nested lists
# With flattening: nested fields are promoted to top-level columns
flat_data <- fromJSON("transactions.json", flatten = TRUE)
# Creates columns like: user.id, user.name, transaction.amount
Example transactions.json:
[
{
"transaction_id": "tx_001",
"user": {
"id": 101,
"name": "Alice",
"tier": "premium"
},
"transaction": {
"amount": 250.00,
"currency": "USD",
"timestamp": "2024-01-15T10:30:00Z"
}
},
{
"transaction_id": "tx_002",
"user": {
"id": 102,
"name": "Bob",
"tier": "basic"
},
"transaction": {
"amount": 75.50,
"currency": "USD",
"timestamp": "2024-01-15T11:45:00Z"
}
}
]
Processing flattened data:
# Flatten nested objects into dot-separated top-level columns.
flat_data <- fromJSON("transactions.json", flatten = TRUE)

# Inspect the generated column names.
colnames(flat_data)
# [1] "transaction_id" "user.id" "user.name" "user.tier"
# [5] "transaction.amount" "transaction.currency" "transaction.timestamp"

# Keep only rows belonging to premium-tier users.
is_premium <- flat_data$user.tier == "premium"
premium_txns <- flat_data[is_premium, ]

# Aggregate transaction totals and counts per tier.
library(dplyr)
tier_summary <- flat_data |>
  group_by(user.tier) |>
  summarise(
    total_amount = sum(transaction.amount),
    transaction_count = n()
  )
read_json vs fromJSON
The read_json() function provides more control over type conversion:
# read_json returns nested lists, no automatic simplification
list_data <- read_json("data.json")
# fromJSON attempts to create data frames from arrays of records
df_data <- fromJSON("data.json")
# Explicit simplification control: simplifyVector toggles whether
# JSON arrays collapse into atomic vectors / data frames
df_explicit <- fromJSON("data.json", simplifyVector = TRUE)
list_explicit <- fromJSON("data.json", simplifyVector = FALSE)
Use read_json() when:
- You need precise control over data structure
- JSON schema is inconsistent
- You’ll manually transform the data
Use fromJSON() when:
- JSON represents tabular data
- You want automatic data frame conversion
- Structure is consistent across records
Handling Large JSON Files
For files that exceed available memory, use stream_in() with line-delimited JSON:
# Stream processing: stream_in() consumes the connection to EOF in a
# SINGLE call; pagesize only controls how many lines are parsed per
# internal batch. It opens and closes an unopened connection itself.
data <- stream_in(file("large_dataset.ndjson"), pagesize = 1000)

# Filter one chunk of records: keep only rows with value > 100.
process_chunk <- function(chunk) {
  chunk[chunk$value > 100, , drop = FALSE]
}

# To process the file chunk-by-chunk without holding it all in memory,
# pass a handler function: stream_in() invokes it once per page of
# records. (Calling stream_in() repeatedly in a loop does NOT chunk --
# the first call reads the connection to EOF, so later calls see
# nothing.) An environment accumulates results without using `<<-`.
acc <- new.env()
acc$results <- list()
stream_in(
  file("large_dataset.ndjson"),
  handler = function(chunk) {
    acc$results[[length(acc$results) + 1]] <- process_chunk(chunk)
  },
  pagesize = 5000,
  verbose = FALSE
)
final_data <- do.call(rbind, acc$results)
Example large_dataset.ndjson (newline-delimited JSON):
{"id": 1, "value": 150, "category": "A"}
{"id": 2, "value": 75, "category": "B"}
{"id": 3, "value": 200, "category": "A"}
Error Handling and Validation
Implement robust error handling for production code:
# Read a JSON file and validate that required fields are present.
#
# @param filepath Path to the JSON file.
# @return The parsed data on success, or NULL when the file cannot be
#   read or required fields are missing (a message is emitted).
safe_read_json <- function(filepath) {
  tryCatch({
    data <- fromJSON(filepath)
    # Use names() rather than colnames(): colnames() is NULL for
    # non-data-frame results (e.g. a nested list parsed from a JSON
    # object), which would wrongly report every field as missing.
    required_cols <- c("id", "timestamp", "value")
    missing_cols <- setdiff(required_cols, names(data))
    if (length(missing_cols) > 0) {
      stop(paste("Missing required columns:",
                 paste(missing_cols, collapse = ", ")), call. = FALSE)
    }
    data
  }, error = function(e) {
    message(sprintf("Failed to read %s: %s", filepath, e$message))
    NULL
  })
}
# Usage: NULL signals failure, so guard before processing
data <- safe_read_json("api_response.json")
if (!is.null(data)) {
# Process data
}
Validate JSON structure before processing:
library(jsonlite)
# Check that a file exists and contains syntactically valid JSON.
#
# @param filepath Path to the candidate JSON file.
# @return A list with elements `valid` (logical) and `error`
#   (character message, or NULL when valid).
validate_json_file <- function(filepath) {
  # Check file exists
  if (!file.exists(filepath)) {
    return(list(valid = FALSE, error = "File not found"))
  }
  tryCatch({
    # readLines() returns one element per line; collapse to a single
    # string before validating.
    json_text <- paste(readLines(filepath, warn = FALSE), collapse = "\n")
    # jsonlite::validate() signals invalid JSON by RETURNING FALSE
    # (with an "err" attribute), not by throwing an error -- so the
    # result must be inspected, not just wrapped in tryCatch().
    result <- jsonlite::validate(json_text)
    if (isTRUE(result)) {
      list(valid = TRUE, error = NULL)
    } else {
      list(valid = FALSE, error = attr(result, "err"))
    }
  }, error = function(e) {
    list(valid = FALSE, error = e$message)
  })
}
# Validate before reading so parse failures are caught up front
validation <- validate_json_file("data.json")
if (validation$valid) {
data <- fromJSON("data.json")
} else {
stop(paste("Invalid JSON:", validation$error))
}
Working with JSON from APIs
Reading JSON directly from HTTP endpoints:
# Direct URL reading: fromJSON() accepts URLs as well as file paths
api_data <- fromJSON("https://api.example.com/v1/data")
# With authentication via httr -- assumes api_token is defined earlier
library(httr)
response <- GET(
"https://api.example.com/v1/data",
add_headers(Authorization = paste("Bearer", api_token))
)
if (status_code(response) == 200) {
data <- fromJSON(content(response, "text", encoding = "UTF-8"))
} else {
stop(sprintf("API request failed with status %d", status_code(response)))
}
# Handling paginated responses
#
# Fetch every page from a paginated endpoint and combine the results.
#
# @param base_url Endpoint URL without query parameters.
# @param token Bearer token for the Authorization header.
# @return A data frame combining all pages' `results` (NULL when the
#   first request fails or returns no results).
fetch_all_pages <- function(base_url, token) {
  all_data <- list()
  page <- 1
  repeat {
    url <- sprintf("%s?page=%d", base_url, page)
    response <- GET(url, add_headers(Authorization = paste("Bearer", token)))
    if (status_code(response) != 200) break
    # Specify the encoding explicitly so parsing does not depend on
    # the server's (possibly absent) charset header.
    page_data <- fromJSON(content(response, "text", encoding = "UTF-8"))
    # length(NULL) is 0, so a missing `results` field also stops here.
    if (length(page_data$results) == 0) break
    all_data[[page]] <- page_data$results
    page <- page + 1
  }
  do.call(rbind, all_data)
}
Performance Optimization
For repeated JSON operations, consider these optimizations:
# Pre-allocate when possible: growing a list inside the loop would
# copy it on every iteration
n_files <- length(json_files)
data_list <- vector("list", n_files)
for (i in seq_along(json_files)) {
data_list[[i]] <- fromJSON(json_files[i])
}
combined_data <- do.call(rbind, data_list)
# Parallel processing for multiple files
library(parallel)
# detectCores() can return 1 (or NA on some platforms); guard so we
# never request a zero-worker cluster.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# Each worker needs jsonlite loaded in its own session.
clusterEvalQ(cl, library(jsonlite))
json_files <- list.files("data/", pattern = "\\.json$", full.names = TRUE)
data_list <- parLapply(cl, json_files, fromJSON)
stopCluster(cl)
final_data <- do.call(rbind, data_list)
The jsonlite package handles most JSON parsing scenarios efficiently. Use fromJSON() for standard cases, stream_in() for large files, and implement validation for production systems. The automatic type conversion and flattening capabilities eliminate most manual data wrangling, letting you focus on analysis rather than parsing.