R - Read JSON File (jsonlite)
Reading JSON files is a routine part of data workflows in R, and the `jsonlite` package is the standard tool for the job. This guide covers everything from basic parsing to streaming large files.
Key Insights
- The `jsonlite` package provides robust JSON parsing with automatic type conversion, handling nested structures more reliably than base R's `rjson` or `RJSONIO`
- Use `read_json()` for list outputs and `fromJSON()` for automatic data frame conversion, with `flatten = TRUE` to normalize nested structures
- Stream large JSON files with `stream_in()` to avoid memory issues, processing line-delimited JSON (ndjson) incrementally
Installing and Loading jsonlite
The jsonlite package is the de facto standard for JSON operations in R. Install it once and load it for each session:
install.packages("jsonlite")  # one-time installation
library(jsonlite)             # load once per session
For production environments, specify the version in your dependency management:
# renv.lock or DESCRIPTION file
jsonlite (>= 1.8.0)
Basic JSON File Reading
The fromJSON() function reads JSON files and automatically converts them to appropriate R data structures:
# Simple JSON object: fromJSON() converts it to a named list
data <- fromJSON("config.json")
# Accessing nested values: nested JSON objects become nested lists,
# so `$` chains walk down the hierarchy
api_key <- data$credentials$api_key
timeout <- data$settings$timeout
Example config.json:
{
"credentials": {
"api_key": "sk-abc123",
"region": "us-east-1"
},
"settings": {
"timeout": 30,
"retry": true
}
}
For arrays, fromJSON() creates data frames when elements have consistent structure:
# Reading a JSON array: records with consistent keys become
# rows of a data frame automatically
users <- fromJSON("users.json")
print(class(users)) # "data.frame"
# The `active` column is already logical, so it can index rows
# directly; comparing with `== TRUE` is redundant
active_users <- users[users$active, , drop = FALSE]
Example users.json:
[
{"id": 1, "name": "Alice", "active": true, "score": 95.5},
{"id": 2, "name": "Bob", "active": false, "score": 87.2},
{"id": 3, "name": "Charlie", "active": true, "score": 92.8}
]
Handling Nested JSON Structures
Real-world JSON often contains deeply nested objects. The flatten parameter normalizes these structures:
# Without flattening: nested objects stay as nested data frames /
# lists inside the result
nested_data <- fromJSON("transactions.json", flatten = FALSE)
# Results in nested lists
# With flattening: nested fields are promoted to top-level columns
flat_data <- fromJSON("transactions.json", flatten = TRUE)
# Creates columns like: user.id, user.name, transaction.amount
Example transactions.json:
[
{
"transaction_id": "tx_001",
"user": {
"id": 101,
"name": "Alice",
"tier": "premium"
},
"transaction": {
"amount": 250.00,
"currency": "USD",
"timestamp": "2024-01-15T10:30:00Z"
}
},
{
"transaction_id": "tx_002",
"user": {
"id": 102,
"name": "Bob",
"tier": "basic"
},
"transaction": {
"amount": 75.50,
"currency": "USD",
"timestamp": "2024-01-15T11:45:00Z"
}
}
]
Processing flattened data:
# Flatten nested objects into dot-separated top-level columns.
flat_data <- fromJSON("transactions.json", flatten = TRUE)

# Inspect the generated column names.
colnames(flat_data)
# [1] "transaction_id" "user.id" "user.name" "user.tier"
# [5] "transaction.amount" "transaction.currency" "transaction.timestamp"

# Keep only rows belonging to premium-tier users.
is_premium <- flat_data$user.tier == "premium"
premium_txns <- flat_data[is_premium, ]

# Aggregate transaction totals and counts per tier.
library(dplyr)
tier_summary <- flat_data |>
  group_by(user.tier) |>
  summarise(
    total_amount = sum(transaction.amount),
    transaction_count = n()
  )
read_json vs fromJSON
The read_json() function provides more control over type conversion:
# read_json returns nested lists, no automatic simplification
list_data <- read_json("data.json")
# fromJSON attempts to create data frames from arrays of records
df_data <- fromJSON("data.json")
# Explicit simplification control: simplifyVector toggles whether
# JSON arrays collapse into atomic vectors / data frames
df_explicit <- fromJSON("data.json", simplifyVector = TRUE)
list_explicit <- fromJSON("data.json", simplifyVector = FALSE)
Use read_json() when:
- You need precise control over data structure
- JSON schema is inconsistent
- You’ll manually transform the data
Use fromJSON() when:
- JSON represents tabular data
- You want automatic data frame conversion
- Structure is consistent across records
Handling Large JSON Files
For files that exceed available memory, use stream_in() with line-delimited JSON:
# Stream processing: stream_in() consumes the connection to EOF in a
# SINGLE call; pagesize only controls how many lines are parsed per
# internal batch. It opens and closes an unopened connection itself.
data <- stream_in(file("large_dataset.ndjson"), pagesize = 1000)

# Filter one chunk of records: keep only rows with value > 100.
process_chunk <- function(chunk) {
  chunk[chunk$value > 100, , drop = FALSE]
}

# To process the file chunk-by-chunk without holding it all in memory,
# pass a handler function: stream_in() invokes it once per page of
# records. (Calling stream_in() repeatedly in a loop does NOT chunk --
# the first call reads the connection to EOF, so later calls see
# nothing.) An environment accumulates results without using `<<-`.
acc <- new.env()
acc$results <- list()
stream_in(
  file("large_dataset.ndjson"),
  handler = function(chunk) {
    acc$results[[length(acc$results) + 1]] <- process_chunk(chunk)
  },
  pagesize = 5000,
  verbose = FALSE
)
final_data <- do.call(rbind, acc$results)
Example large_dataset.ndjson (newline-delimited JSON):
{"id": 1, "value": 150, "category": "A"}
{"id": 2, "value": 75, "category": "B"}
{"id": 3, "value": 200, "category": "A"}
Error Handling and Validation
Implement robust error handling for production code:
# Read a JSON file and validate that required fields are present.
#
# @param filepath Path to the JSON file.
# @return The parsed data on success, or NULL when the file cannot be
#   read or required fields are missing (a message is emitted).
safe_read_json <- function(filepath) {
  tryCatch({
    data <- fromJSON(filepath)
    # Use names() rather than colnames(): colnames() is NULL for
    # non-data-frame results (e.g. a nested list parsed from a JSON
    # object), which would wrongly report every field as missing.
    required_cols <- c("id", "timestamp", "value")
    missing_cols <- setdiff(required_cols, names(data))
    if (length(missing_cols) > 0) {
      stop(paste("Missing required columns:",
                 paste(missing_cols, collapse = ", ")), call. = FALSE)
    }
    data
  }, error = function(e) {
    message(sprintf("Failed to read %s: %s", filepath, e$message))
    NULL
  })
}
# Usage: NULL signals failure, so guard before processing
data <- safe_read_json("api_response.json")
if (!is.null(data)) {
# Process data
}
Validate JSON structure before processing:
library(jsonlite)
# Check that a file exists and contains syntactically valid JSON.
#
# @param filepath Path to the candidate JSON file.
# @return A list with elements `valid` (logical) and `error`
#   (character message, or NULL when valid).
validate_json_file <- function(filepath) {
  # Check file exists
  if (!file.exists(filepath)) {
    return(list(valid = FALSE, error = "File not found"))
  }
  tryCatch({
    # readLines() returns one element per line; collapse to a single
    # string before validating.
    json_text <- paste(readLines(filepath, warn = FALSE), collapse = "\n")
    # jsonlite::validate() signals invalid JSON by RETURNING FALSE
    # (with an "err" attribute), not by throwing an error -- so the
    # result must be inspected, not just wrapped in tryCatch().
    result <- jsonlite::validate(json_text)
    if (isTRUE(result)) {
      list(valid = TRUE, error = NULL)
    } else {
      list(valid = FALSE, error = attr(result, "err"))
    }
  }, error = function(e) {
    list(valid = FALSE, error = e$message)
  })
}
# Validate before reading so parse failures are caught up front
validation <- validate_json_file("data.json")
if (validation$valid) {
data <- fromJSON("data.json")
} else {
stop(paste("Invalid JSON:", validation$error))
}
Working with JSON from APIs
Reading JSON directly from HTTP endpoints:
# Direct URL reading: fromJSON() accepts URLs as well as file paths
api_data <- fromJSON("https://api.example.com/v1/data")
# With authentication via httr -- assumes api_token is defined earlier
library(httr)
response <- GET(
"https://api.example.com/v1/data",
add_headers(Authorization = paste("Bearer", api_token))
)
if (status_code(response) == 200) {
data <- fromJSON(content(response, "text", encoding = "UTF-8"))
} else {
stop(sprintf("API request failed with status %d", status_code(response)))
}
# Handling paginated responses
#
# Fetch every page from a paginated endpoint and combine the results.
#
# @param base_url Endpoint URL without query parameters.
# @param token Bearer token for the Authorization header.
# @return A data frame combining all pages' `results` (NULL when the
#   first request fails or returns no results).
fetch_all_pages <- function(base_url, token) {
  all_data <- list()
  page <- 1
  repeat {
    url <- sprintf("%s?page=%d", base_url, page)
    response <- GET(url, add_headers(Authorization = paste("Bearer", token)))
    if (status_code(response) != 200) break
    # Specify the encoding explicitly so parsing does not depend on
    # the server's (possibly absent) charset header.
    page_data <- fromJSON(content(response, "text", encoding = "UTF-8"))
    # length(NULL) is 0, so a missing `results` field also stops here.
    if (length(page_data$results) == 0) break
    all_data[[page]] <- page_data$results
    page <- page + 1
  }
  do.call(rbind, all_data)
}
Performance Optimization
For repeated JSON operations, consider these optimizations:
# Pre-allocate when possible: growing a list inside the loop would
# copy it on every iteration
n_files <- length(json_files)
data_list <- vector("list", n_files)
for (i in seq_along(json_files)) {
data_list[[i]] <- fromJSON(json_files[i])
}
combined_data <- do.call(rbind, data_list)
# Parallel processing for multiple files
library(parallel)
# detectCores() can return 1 (or NA on some platforms); guard so we
# never request a zero-worker cluster.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# Each worker needs jsonlite loaded in its own session.
clusterEvalQ(cl, library(jsonlite))
json_files <- list.files("data/", pattern = "\\.json$", full.names = TRUE)
data_list <- parLapply(cl, json_files, fromJSON)
stopCluster(cl)
final_data <- do.call(rbind, data_list)
The jsonlite package handles most JSON parsing scenarios efficiently. Use fromJSON() for standard cases, stream_in() for large files, and implement validation for production systems. The automatic type conversion and flattening capabilities eliminate most manual data wrangling, letting you focus on analysis rather than parsing.