R - Create Data Frame with Examples
The `data.frame()` function constructs a data frame from vectors. Each vector becomes a column, and all vectors must have equal length.
Key Insights
- Data frames are R’s fundamental two-dimensional data structure, storing columns of different types (numeric, character, logical) with equal length
- Creating data frames involves the `data.frame()` function, vector construction, or conversion from other structures like matrices and lists
- Modern R development often uses tibbles from the tidyverse as enhanced data frames with better printing and stricter subsetting behavior
Basic Data Frame Creation
The data.frame() function constructs a data frame from vectors. Each vector becomes a column, and all vectors must have equal length.
# Build a data frame column by column; each argument name becomes a column
# name and all four vectors hold five elements.
employees <- data.frame(
  id     = c(101, 102, 103, 104, 105),
  name   = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  salary = c(75000, 82000, 68000, 91000, 73000),
  active = c(TRUE, TRUE, FALSE, TRUE, TRUE)
)

print(employees)
#    id    name salary active
# 1 101   Alice  75000   TRUE
# 2 102     Bob  82000   TRUE
# 3 103 Charlie  68000  FALSE
# 4 104   Diana  91000   TRUE
# 5 105     Eve  73000   TRUE

# Inspect the column types
str(employees)
# 'data.frame': 5 obs. of 4 variables:
#  $ id    : num 101 102 103 104 105
#  $ name  : chr "Alice" "Bob" "Charlie" "Diana" ...
#  $ salary: num 75000 82000 68000 91000 73000
#  $ active: logi TRUE TRUE FALSE TRUE TRUE
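In R versions before 4.0.0, data.frame() converted character vectors to factors by default; since R 4.0.0 the default is stringsAsFactors = FALSE. Pass stringsAsFactors = FALSE explicitly when the code must behave the same on older R versions.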
# Keep text columns as plain character vectors instead of factors
df <- data.frame(
  category         = c("A", "B", "A", "C"),
  value            = c(10, 20, 15, 25),
  stringsAsFactors = FALSE
)

str(df)
# 'data.frame': 4 obs. of 2 variables:
#  $ category: chr "A" "B" "A" "C"
#  $ value   : num 10 20 15 25
Creating Empty Data Frames
Initialize empty data frames for iterative population or when structure is known but data arrives later.
# Empty data frame with column types: zero-length typed vectors fix the class
# of every column before any rows exist.
empty_df <- data.frame(
  product_id = integer(),
  product_name = character(),
  price = numeric(),
  in_stock = logical(),
  stringsAsFactors = FALSE
)

str(empty_df)
# 'data.frame': 0 obs. of 4 variables:
#  $ product_id  : int
#  $ product_name: chr
#  $ price       : num
#  $ in_stock    : logi

# Add a row to the empty data frame. The new values must have the same types
# as the declared columns: `1L` (not `1`) keeps product_id an integer, because
# rbind() would otherwise promote the whole column to double.
# stringsAsFactors = FALSE mirrors the template so product_name stays
# character on R versions before 4.0.0 as well.
empty_df <- rbind(empty_df, data.frame(
  product_id = 1L,
  product_name = "Laptop",
  price = 999.99,
  in_stock = TRUE,
  stringsAsFactors = FALSE
))
Creating Data Frames from Sequences and Patterns
Generate data frames with patterned data using sequence functions and replication.
# Hourly timestamps for one day, paired with simulated sensor readings
time_series <- data.frame(
  timestamp = seq(
    from = as.POSIXct("2024-01-01 00:00:00"),
    to   = as.POSIXct("2024-01-01 23:00:00"),
    by   = "hour"
  ),
  temperature = rnorm(n = 24, mean = 20, sd = 3),
  humidity    = runif(n = 24, min = 40, max = 80)
)

head(time_series, 3)
# Illustrative output: the random columns differ on every run unless a seed
# is set beforehand.
#             timestamp temperature humidity
# 1 2024-01-01 00:00:00    18.73457 62.41534
# 2 2024-01-01 01:00:00    21.45632 55.78923
# 3 2024-01-01 02:00:00    19.87234 71.23456

# rep() builds the repeated labels: `each` repeats every element in place,
# `times` repeats the whole vector.
repeated_data <- data.frame(
  group       = rep(c("Control", "Treatment"), each = 5),
  replicate   = rep(1:5, times = 2),
  measurement = rnorm(n = 10, mean = 100, sd = 15)
)

print(repeated_data)
#        group replicate measurement
# 1    Control         1    98.45632
# 2    Control         2   112.34567
# ...
Converting Matrices to Data Frames
Transform matrices into data frames using as.data.frame(), which preserves row and column names.
# Build a 4 x 3 matrix, naming its rows and columns in the same call
mat <- matrix(
  1:12,
  nrow = 4,
  ncol = 3,
  dimnames = list(
    c("Product_A", "Product_B", "Product_C", "Product_D"),
    c("Q1", "Q2", "Q3")
  )
)

print(mat)
#           Q1 Q2 Q3
# Product_A  1  5  9
# Product_B  2  6 10
# Product_C  3  7 11
# Product_D  4  8 12

# The matrix dimnames carry over as the data frame's row and column names
df_from_mat <- as.data.frame(mat)

# Move the row names into a regular column, then reset them to 1..n
df_from_mat[["Product"]] <- row.names(df_from_mat)
row.names(df_from_mat) <- NULL

print(df_from_mat)
#   Q1 Q2 Q3   Product
# 1  1  5  9 Product_A
# 2  2  6 10 Product_B
# 3  3  7 11 Product_C
# 4  4  8 12 Product_D
Creating Data Frames from Lists
Lists provide flexible input for data frame creation, especially when working with nested structures.
# A named list of equal-length vectors maps directly onto columns
customer_list <- list(
  customer_id = c(1001, 1002, 1003),
  name        = c("TechCorp", "DataInc", "CloudSys"),
  revenue     = c(150000, 230000, 180000),
  tier        = c("Gold", "Platinum", "Gold")
)

customers <- as.data.frame(customer_list)

print(customers)
#   customer_id     name revenue     tier
# 1        1001 TechCorp  150000     Gold
# 2        1002  DataInc  230000 Platinum
# 3        1003 CloudSys  180000     Gold
# Row-wise input: each inner list is one record
transactions <- list(
  list(txn_id = "T001", amount = 250.50, status = "completed"),
  list(txn_id = "T002", amount = 175.00, status = "pending"),
  list(txn_id = "T003", amount = 320.75, status = "completed")
)

# Turn every record into a one-row data frame, then stack them in one call
txn_df <- do.call(
  rbind,
  lapply(transactions, function(txn) as.data.frame(txn))
)

print(txn_df)
#   txn_id amount    status
# 1   T001 250.50 completed
# 2   T002 175.00   pending
# 3   T003 320.75 completed
Using read.table() for Manual Input
The read.table() function, called with its text argument, parses a whitespace-delimited string into a data frame.
# Parse an inline, whitespace-delimited table; the first non-blank line
# supplies the column names.
sales_data <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
Region Product Sales
North Widget 1200
South Gadget 1500
East Widget 1100
West Gadget 1800
North Gadget 1350
"
)

print(sales_data)
#   Region Product Sales
# 1  North  Widget  1200
# 2  South  Gadget  1500
# 3   East  Widget  1100
# 4   West  Gadget  1800
# 5  North  Gadget  1350
Creating Tibbles with tidyverse
Tibbles offer improved behavior over traditional data frames: better printing, no automatic type conversion, and stricter subsetting.
library(tibble)
# Column-wise tibble construction; each column keeps the type it is given
employees_tbl <- tibble(
  id         = 101:105,
  name       = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  department = c("Engineering", "Sales", "Engineering", "Marketing", "Sales"),
  salary     = c(75000, 82000, 68000, 91000, 73000),
  hire_date  = as.Date(c(
    "2020-01-15", "2019-06-01", "2021-03-10", "2018-11-20", "2020-08-05"
  ))
)

print(employees_tbl)
# # A tibble: 5 × 5
#      id name    department  salary hire_date
#   <int> <chr>   <chr>        <dbl> <date>
# 1   101 Alice   Engineering  75000 2020-01-15
# 2   102 Bob     Sales        82000 2019-06-01
# 3   103 Charlie Engineering  68000 2021-03-10
# 4   104 Diana   Marketing    91000 2018-11-20
# 5   105 Eve     Sales        73000 2020-08-05
# Tribble for row-wise creation: `~name` headers first, then one row per line.
# Every cell in a column must share one type. tribble() does not coerce mixed
# numbers, strings and logicals to character (current tibble versions raise a
# "can't combine" error instead), so the heterogeneous settings are written as
# strings, with the `type` column recording how to parse each one.
config_tbl <- tribble(
  ~parameter,    ~value,   ~type,
  "max_retries", "3",      "integer",
  "timeout",     "30",     "integer",
  "api_key",     "abc123", "string",
  "debug_mode",  "TRUE",   "boolean"
)

print(config_tbl)
# # A tibble: 4 × 3
#   parameter   value  type
#   <chr>       <chr>  <chr>
# 1 max_retries 3      integer
# 2 timeout     30     integer
# 3 api_key     abc123 string
# 4 debug_mode  TRUE   boolean
Expanding Grid for Combinations
The expand.grid() function generates data frames with all combinations of supplied vectors, useful for experimental designs and parameter grids.
# Full factorial grid: 3 temperatures x 3 pressures x 2 catalysts = 18 rows.
# The first factor varies fastest. Note that expand.grid() turns character
# inputs into factors by default, so `catalyst` is a factor column.
experiment_design <- expand.grid(
  temperature = c(20, 25, 30),
  pressure    = c(1, 2, 3),
  catalyst    = c("A", "B")
)

print(experiment_design)
#    temperature pressure catalyst
# 1           20        1        A
# 2           25        1        A
# 3           30        1        A
# 4           20        2        A
# 5           25        2        A
# 6           30        2        A
# 7           20        3        A
# ...

# Attach a simulated response, one random value per design row
experiment_design[["yield"]] <- runif(nrow(experiment_design), min = 50, max = 95)
Data Frame with Row Names
While tibbles discourage row names, traditional data frames support them for certain use cases.
# `row.names` labels each row with its trading date instead of 1..n
stock_prices <- data.frame(
  open      = c(150.25, 148.50, 151.00),
  close     = c(152.00, 149.75, 153.25),
  volume    = c(1500000, 1750000, 1600000),
  row.names = c("2024-01-01", "2024-01-02", "2024-01-03")
)

print(stock_prices)
#              open  close  volume
# 2024-01-01 150.25 152.00 1500000
# 2024-01-02 148.50 149.75 1750000
# 2024-01-03 151.00 153.25 1600000

# Look up a single row by its label; the empty column index keeps all columns
stock_prices["2024-01-02", ]
#             open  close  volume
# 2024-01-02 148.5 149.75 1750000
Data frames form the backbone of data manipulation in R. Choose data.frame() for base R compatibility, tibbles for modern workflows with better defaults, and expand.grid() for systematic combination generation. Understanding these creation methods enables efficient data structure initialization for analysis pipelines.