R - Read JSON File (jsonlite)

This guide covers reading JSON files in R with the `jsonlite` package, from basic parsing and nested-structure flattening to streaming large files, validation, and API workflows.

Key Insights

  • The jsonlite package provides robust JSON parsing with automatic type conversion, handling nested structures more reliably than the older third-party rjson or RJSONIO packages
  • Use read_json() for list outputs and fromJSON() for automatic data frame conversion, with flatten=TRUE to normalize nested structures
  • Stream large JSON files with stream_in() to avoid memory issues, processing line-delimited JSON (ndjson) incrementally

Installing and Loading jsonlite

The jsonlite package is the de facto standard for JSON operations in R. Install it once and load it for each session:

install.packages("jsonlite")
library(jsonlite)

For production environments, specify the version in your dependency management:

# renv.lock or DESCRIPTION file
jsonlite (>= 1.8.0)

Basic JSON File Reading

The fromJSON() function reads JSON files and automatically converts them to appropriate R data structures:

# Simple JSON object
data <- fromJSON("config.json")

# Accessing nested values
api_key <- data$credentials$api_key
timeout <- data$settings$timeout

Example config.json:

{
  "credentials": {
    "api_key": "sk-abc123",
    "region": "us-east-1"
  },
  "settings": {
    "timeout": 30,
    "retry": true
  }
}

For arrays, fromJSON() creates data frames when elements have consistent structure:

# Reading JSON array
users <- fromJSON("users.json")
print(class(users))  # "data.frame"

# Access columns directly. Index with the logical column itself rather
# than comparing `== TRUE` (an anti-pattern); which() also drops any
# NA entries instead of producing NA-filled rows.
active_users <- users[which(users$active), ]

Example users.json:

[
  {"id": 1, "name": "Alice", "active": true, "score": 95.5},
  {"id": 2, "name": "Bob", "active": false, "score": 87.2},
  {"id": 3, "name": "Charlie", "active": true, "score": 92.8}
]

Handling Nested JSON Structures

Real-world JSON often contains deeply nested objects. The flatten parameter normalizes these structures:

# Without flattening
nested_data <- fromJSON("transactions.json", flatten = FALSE)
# Results in nested lists

# With flattening
flat_data <- fromJSON("transactions.json", flatten = TRUE)
# Creates columns like: user.id, user.name, transaction.amount

Example transactions.json:

[
  {
    "transaction_id": "tx_001",
    "user": {
      "id": 101,
      "name": "Alice",
      "tier": "premium"
    },
    "transaction": {
      "amount": 250.00,
      "currency": "USD",
      "timestamp": "2024-01-15T10:30:00Z"
    }
  },
  {
    "transaction_id": "tx_002",
    "user": {
      "id": 102,
      "name": "Bob",
      "tier": "basic"
    },
    "transaction": {
      "amount": 75.50,
      "currency": "USD",
      "timestamp": "2024-01-15T11:45:00Z"
    }
  }
]

Processing flattened data:

flat_data <- fromJSON("transactions.json", flatten = TRUE)

# Column names are automatically created
colnames(flat_data)
# [1] "transaction_id" "user.id" "user.name" "user.tier" 
# [5] "transaction.amount" "transaction.currency" "transaction.timestamp"

# Filter premium users
premium_txns <- flat_data[flat_data$user.tier == "premium", ]

# Calculate total by tier
library(dplyr)
tier_summary <- flat_data %>%
  group_by(user.tier) %>%
  summarise(
    total_amount = sum(transaction.amount),
    transaction_count = n()
  )

read_json vs fromJSON

The read_json() function provides more control over type conversion:

# read_json returns nested lists, no automatic simplification
list_data <- read_json("data.json")

# fromJSON attempts to create data frames
df_data <- fromJSON("data.json")

# Explicit simplification control
df_explicit <- fromJSON("data.json", simplifyVector = TRUE)
list_explicit <- fromJSON("data.json", simplifyVector = FALSE)

Use read_json() when:

  • You need precise control over data structure
  • JSON schema is inconsistent
  • You’ll manually transform the data

Use fromJSON() when:

  • JSON represents tabular data
  • You want automatic data frame conversion
  • Structure is consistent across records

Handling Large JSON Files

For files that exceed available memory, use stream_in() with line-delimited JSON:

# Stream processing
con <- file("large_dataset.ndjson", "r")
data <- stream_in(con, pagesize = 1000)
close(con)

# Process in chunks
# Filter one streamed chunk down to the rows of interest.
# Keeps only records whose `value` column exceeds 100 and returns a
# data frame with the same columns as the input chunk.
process_chunk <- function(chunk) {
  keep <- chunk$value > 100
  chunk[keep, ]
}

# Chunked processing via the handler callback. stream_in() reads the
# connection to EOF in a SINGLE call -- `pagesize` only controls how
# many lines are parsed per batch -- so wrapping it in a repeat loop
# does not iterate over chunks. Instead, pass a handler: stream_in()
# invokes it once per `pagesize` lines.
results <- list()
stream_in(
  # Pass an unopened connection; stream_in() opens and closes it itself.
  file("large_dataset.ndjson"),
  handler = function(chunk) {
    # <<- because the handler runs in its own scope.
    results[[length(results) + 1]] <<- process_chunk(chunk)
  },
  pagesize = 5000,
  verbose = FALSE
)

final_data <- do.call(rbind, results)

Example large_dataset.ndjson (newline-delimited JSON):

{"id": 1, "value": 150, "category": "A"}
{"id": 2, "value": 75, "category": "B"}
{"id": 3, "value": 200, "category": "A"}

Error Handling and Validation

Implement robust error handling for production code:

# Read a JSON file and verify it contains the columns downstream code
# depends on.
#
# Args:
#   filepath - path to the JSON file to read.
#
# Returns the parsed data, or NULL (after emitting a message) when the
# file cannot be parsed or lacks a required column.
safe_read_json <- function(filepath) {
  tryCatch({
    data <- fromJSON(filepath)

    # Fail fast if the payload is missing any field we rely on.
    required_cols <- c("id", "timestamp", "value")
    missing_cols <- setdiff(required_cols, colnames(data))
    if (length(missing_cols) > 0) {
      stop("Missing required columns: ",
           paste(missing_cols, collapse = ", "))
    }

    data
  }, error = function(e) {
    message(sprintf("Failed to read %s: %s", filepath, e$message))
    NULL
  })
}

# Usage
data <- safe_read_json("api_response.json")
if (!is.null(data)) {
  # Process data
}

Validate JSON structure before processing:

library(jsonlite)

# Validate that `filepath` exists and contains syntactically valid JSON,
# without fully parsing it into R structures.
#
# Args:
#   filepath - path to the candidate JSON file.
#
# Returns a list with:
#   valid - TRUE or FALSE
#   error - NULL when valid, otherwise a character description
validate_json_file <- function(filepath) {
  # Check file exists
  if (!file.exists(filepath)) {
    return(list(valid = FALSE, error = "File not found"))
  }

  # jsonlite::validate() returns TRUE/FALSE (with an "err" attribute on
  # failure) -- it does NOT signal an error for invalid JSON, so the
  # return value must be inspected explicitly. The file is collapsed to
  # a single string because validate() expects one JSON text, not a
  # character vector of lines. tryCatch() still guards read failures.
  result <- tryCatch(
    jsonlite::validate(
      paste(readLines(filepath, warn = FALSE), collapse = "\n")
    ),
    error = function(e) structure(FALSE, err = conditionMessage(e))
  )

  if (isTRUE(result)) {
    list(valid = TRUE, error = NULL)
  } else {
    err <- attr(result, "err")
    if (is.null(err)) {
      err <- "Invalid JSON"
    }
    list(valid = FALSE, error = err)
  }
}

# Validate before reading
validation <- validate_json_file("data.json")
if (validation$valid) {
  data <- fromJSON("data.json")
} else {
  stop(paste("Invalid JSON:", validation$error))
}

Working with JSON from APIs

Reading JSON directly from HTTP endpoints:

# Direct URL reading
api_data <- fromJSON("https://api.example.com/v1/data")

# With authentication
library(httr)

response <- GET(
  "https://api.example.com/v1/data",
  add_headers(Authorization = paste("Bearer", api_token))
)

if (status_code(response) == 200) {
  data <- fromJSON(content(response, "text", encoding = "UTF-8"))
} else {
  stop(sprintf("API request failed with status %d", status_code(response)))
}

# Handling paginated responses
# Fetch every page of a paginated API endpoint and combine the
# `results` element of each page into one data frame.
#
# Args:
#   base_url - endpoint URL without a query string.
#   token    - bearer token placed in the Authorization header.
#
# Returns a data frame of all collected rows; NULL when no page
# returned any results (do.call(rbind, list()) yields NULL).
fetch_all_pages <- function(base_url, token) {
  all_data <- list()
  page <- 1

  repeat {
    url <- sprintf("%s?page=%d", base_url, page)
    response <- GET(url, add_headers(Authorization = paste("Bearer", token)))

    # Stop on any non-success status (e.g. a 404 past the last page).
    if (status_code(response) != 200) break

    # Decode explicitly as UTF-8 so parsing is not locale-dependent,
    # consistent with the single-request example above.
    page_data <- fromJSON(content(response, "text", encoding = "UTF-8"))
    if (length(page_data$results) == 0) break

    all_data[[page]] <- page_data$results
    page <- page + 1
  }

  do.call(rbind, all_data)
}

Performance Optimization

For repeated JSON operations, consider these optimizations:

# Pre-allocate when possible
n_files <- length(json_files)
data_list <- vector("list", n_files)

for (i in seq_along(json_files)) {
  data_list[[i]] <- fromJSON(json_files[i])
}

combined_data <- do.call(rbind, data_list)

# Parallel processing for multiple files
library(parallel)

cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, library(jsonlite))

json_files <- list.files("data/", pattern = "\\.json$", full.names = TRUE)
data_list <- parLapply(cl, json_files, fromJSON)
stopCluster(cl)

final_data <- do.call(rbind, data_list)

The jsonlite package handles most JSON parsing scenarios efficiently. Use fromJSON() for standard cases, stream_in() for large files, and implement validation for production systems. The automatic type conversion and flattening capabilities eliminate most manual data wrangling, letting you focus on analysis rather than parsing.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.