R - Complete Tutorial for Beginners | Application Architect

Key Insights

• R is a specialized language for statistical computing and data visualization, with a syntax optimized for vectorized operations that eliminate most explicit loops
• The tidyverse ecosystem provides a consistent, pipe-based workflow that dramatically improves code readability compared to base R approaches
• Understanding data frames, factors, and R’s functional programming features is essential for effective data manipulation and analysis

Installing R and RStudio

Download R from CRAN (https://cran.r-project.org/) and RStudio Desktop from Posit. RStudio provides an integrated development environment with script editor, console, environment viewer, and plot display.

After installation, verify your setup:

# Check R version
R.version.string

# Install essential packages, skipping any that are already installed so that
# re-running this script does not download and reinstall them every time.
# system.file() returns "" for a package that cannot be found.
required_packages <- c("tidyverse", "data.table", "ggplot2")
is_installed <- vapply(
  required_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Load a package
library(tidyverse)

Basic Data Types and Structures

R has six atomic vector types: double (which class() reports as "numeric"), integer, character, logical, complex, and raw. Most work involves the first four.

# Numeric (double precision by default)
x <- 42.5
class(x)  # "numeric"

# Integer (use L suffix)
y <- 42L
class(y)  # "integer"

# Character
name <- "Alice"

# Logical
is_active <- TRUE

# Vectors (homogeneous collections)
numbers <- c(1, 2, 3, 4, 5)
# Called `people` rather than `names` so the variable does not mask the
# base function names()
people <- c("Alice", "Bob", "Charlie")

# Named vectors: each element carries a label that can be used as an index
ages <- c(Alice = 25, Bob = 30, Charlie = 35)
ages["Alice"]  # 25

Vectorized Operations

R excels at vectorized operations, applying functions to entire vectors without explicit loops:

# Arithmetic operations
# Two numeric vectors with five elements each
x <- c(1, 2, 3, 4, 5)
y <- c(10, 20, 30, 40, 50)

# Each expression below works on the whole vector at once; no loop is written
x + y          # Element-wise addition: c(11, 22, 33, 44, 55)
x * 2          # Scalar multiplication: c(2, 4, 6, 8, 10)
x^2            # Element-wise squaring: c(1, 4, 9, 16, 25)
sqrt(x)        # Element-wise square root

# Logical operations
# A comparison yields one TRUE/FALSE per element; that result can index x
x > 3          # c(FALSE, FALSE, FALSE, TRUE, TRUE)
x[x > 3]       # c(4, 5) - subsetting

# Recycling rule (shorter vector repeats)
# c(10, 20) is reused as 10, 20, 10, 20 to match the four-element vector
c(1, 2, 3, 4) + c(10, 20)  # c(11, 22, 13, 24)

Data Frames

Data frames are the primary structure for tabular data, combining vectors of equal length:

# Create a data frame
# Four rows, one per employee; every column vector has the same length (4)
employees <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana"),
  age = c(25, 30, 35, 28),
  salary = c(50000, 60000, 75000, 55000),
  department = c("IT", "HR", "IT", "Finance")
)

# Access columns
# Three equivalent ways to pull a single column out as a vector
employees$name
employees[["age"]]
employees[, "salary"]

# Access rows
# Leaving the column position empty after the comma keeps every column
employees[1, ]           # First row
employees[employees$age > 28, ]  # Filter rows (here: Bob, 30 and Charlie, 35)

# Add new column
# bonus is 10% of salary, computed for all rows at once
employees$bonus <- employees$salary * 0.1

# Summary statistics
summary(employees)
str(employees)  # Structure

Working with Factors

Factors represent categorical data with predefined levels:

# Create factor
# Five observations drawn from three distinct categories
dept <- factor(c("IT", "HR", "IT", "Finance", "HR"))
levels(dept)  # "Finance" "HR" "IT" (alphabetical)

# Ordered factors
# Levels are listed explicitly so the order is Low < Medium < High instead of
# the alphabetical order shown above
satisfaction <- factor(
  c("High", "Low", "Medium", "High", "Medium"),
  levels = c("Low", "Medium", "High"),
  ordered = TRUE
)

# Factor operations
table(dept)  # Frequency count (Finance 1, HR 2, IT 2)
summary(dept)

# Convert to character
# Gives back the original labels as plain strings
as.character(dept)

Tidyverse Data Manipulation

The tidyverse provides dplyr for data manipulation with intuitive verbs:

library(dplyr)

# Sample data
# Six sales records; product and region repeat so they can be grouped on
sales <- data.frame(
  product = c("A", "B", "A", "C", "B", "A"),
  region = c("North", "North", "South", "North", "South", "South"),
  revenue = c(1000, 1500, 1200, 800, 1600, 1100),
  units = c(10, 15, 12, 8, 16, 11)
)

# Filter rows
# Keeps the four records whose revenue is strictly above 1000
sales %>% 
  filter(revenue > 1000)

# Select columns
# Keeps only the product and revenue columns
sales %>% 
  select(product, revenue)

# Create new columns
# Adds price_per_unit, computed row by row from existing columns
sales %>% 
  mutate(price_per_unit = revenue / units)

# Group and summarize
# One output row per product: revenue total, mean units, and record count
# (for this data: A = 3300, B = 3100, C = 800 in total_revenue)
sales %>% 
  group_by(product) %>% 
  summarize(
    total_revenue = sum(revenue),
    avg_units = mean(units),
    count = n()
  )

# Chain operations
# North-region records only, totalled per product, largest total first
sales %>% 
  filter(region == "North") %>% 
  group_by(product) %>% 
  summarize(total = sum(revenue)) %>% 
  arrange(desc(total))

Reading and Writing Data

R handles multiple data formats:

# CSV files
# The result is stored as `employees_csv` rather than `data`, so the variable
# does not mask the utils function data()
write.csv(employees, "employees.csv", row.names = FALSE)
employees_csv <- read.csv("employees.csv")

# Tidyverse alternative (faster, better defaults)
# Writes to and reads from the same "employees.csv" file as above
library(readr)
write_csv(employees, "employees.csv")
employees_csv <- read_csv("employees.csv")

# Excel files
# Expects a workbook named data.xlsx in the working directory that contains a
# sheet called "Sheet1"; this script does not create that file
library(readxl)
excel_data <- read_excel("data.xlsx", sheet = "Sheet1")

# RDS (R native format, preserves types)
# Round trip: save the data frame, then load it back into `employees`
saveRDS(employees, "employees.rds")
employees <- readRDS("employees.rds")

Control Structures

While vectorization is preferred, control structures are sometimes necessary:

# Branching on a single value: exactly one of the three messages is printed
x <- 5
if (x == 0) {
  print("Zero")
} else if (x > 0) {
  print("Positive")
} else {
  print("Negative")
}

# ifelse() is the vectorized counterpart: it yields one label per element
numbers <- c(-2, 0, 3, -5, 7)
ifelse(numbers <= 0, "Non-positive", "Positive")

# Counted loop (prefer a vectorized expression when one exists)
for (i in seq_len(5)) {
  print(i^2)
}

# Condition-controlled loop: prints 1 through 5, then stops
counter <- 1
while (counter < 6) {
  print(counter)
  counter <- counter + 1
}

Functions

Functions encapsulate reusable logic:

# Body mass index: weight (kg) divided by the square of height (m).
# The value of the last expression is what the function returns.
calculate_bmi <- function(weight_kg, height_m) {
  weight_kg / height_m^2
}

calculate_bmi(70, 1.75)  # 22.86

# Optional argument: `greeting` falls back to "Hello" when the caller omits it.
# The greeting and the name are joined with a single space.
greet <- function(name, greeting = "Hello") {
  paste(greeting, name, sep = " ")
}

greet("Alice")              # "Hello Alice"
greet("Bob", "Hi")          # "Hi Bob"

# Returning several values at once: bundle them in a named list.
# The list holds the mean, median, and standard deviation of `x`.
stats <- function(x) {
  average <- mean(x)
  middle <- median(x)
  spread <- sd(x)
  list(mean = average, median = middle, sd = spread)
}

result <- stats(c(1, 2, 3, 4, 5))
result$mean

Data Visualization with ggplot2

ggplot2 uses a grammar of graphics approach:

library(ggplot2)

# Scatter plot
# One point per employee: age on the x axis, salary on the y axis.
# Layers and settings are combined with `+`.
ggplot(employees, aes(x = age, y = salary)) +
  geom_point(size = 3, color = "blue") +
  labs(title = "Age vs Salary", x = "Age", y = "Salary") +
  theme_minimal()

# Bar chart
# Department is mapped to both the x position and the fill colour
ggplot(employees, aes(x = department, fill = department)) +
  geom_bar() +
  labs(title = "Employees by Department") +
  theme_classic()

# Box plot with grouping
# One box of salary values per department, filled by department
ggplot(employees, aes(x = department, y = salary, fill = department)) +
  geom_boxplot() +
  labs(title = "Salary Distribution by Department") +
  theme_bw()

# Line plot with multiple series
# Fix the random seed so the simulated sales, and therefore the plot, are the
# same on every run
set.seed(42)

# 24 rows: months 1-12 for product "A", followed by months 1-12 for product "B"
time_series <- data.frame(
  month = rep(1:12, 2),
  product = rep(c("A", "B"), each = 12),
  sales = c(rnorm(12, 100, 10), rnorm(12, 120, 15))
)

# Mapping product to colour draws one line per product.
# Line thickness is set with `linewidth`; using `size` for lines is deprecated
# as of ggplot2 3.4.0 (`size` still controls point size).
ggplot(time_series, aes(x = month, y = sales, color = product)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_x_continuous(breaks = 1:12) +
  labs(title = "Monthly Sales by Product")

Statistical Analysis

R provides comprehensive statistical functions:

# Descriptive statistics
# Stored as `scores` rather than `data`, so the variable does not mask the
# utils function data()
scores <- c(23, 45, 67, 34, 56, 78, 90, 12, 34, 56)
mean(scores)                     # 49.5
median(scores)                   # 50.5
sd(scores)                       # Standard deviation
var(scores)                      # Variance
quantile(scores, c(0.25, 0.75))  # First and third quartiles

# Correlation
# Correlation coefficient between two equal-length numeric vectors
x <- c(1, 2, 3, 4, 5)
y <- c(2, 4, 5, 4, 5)
cor(x, y)

# Linear regression
# Models salary as a linear function of age, using the employees data frame
model <- lm(salary ~ age, data = employees)
summary(model)
coef(model)
# Predicted salary for two new ages (27 and 32) that are not in the data
predict(model, newdata = data.frame(age = c(27, 32)))

# T-test
# Two-sample comparison of group1 against group2
group1 <- c(23, 25, 27, 29, 31)
group2 <- c(33, 35, 37, 39, 41)
t.test(group1, group2)

Next Steps

Master R by working with real datasets. Practice data cleaning, exploratory analysis, and visualization. Explore specialized packages for your domain: caret for machine learning, sf for spatial data, or Shiny for web applications. The R community provides extensive documentation and package vignettes—use them liberally.