R - Linear Regression (lm)
The `lm()` function fits linear models using the formula interface `y ~ x1 + x2 + ...`. The function returns a model object containing coefficients, residuals, fitted values, and statistical summaries.
Key Insights
- The `lm()` function in R implements ordinary least squares regression with automatic handling of categorical variables, interaction terms, and polynomial features through formula syntax
- Model diagnostics require checking four key assumptions: linearity, independence, homoscedasticity, and normality of residuals using built-in plotting functions
- R’s formula interface supports complex model specifications including transformations, interactions, and nested terms without manual matrix construction
Basic Linear Regression Syntax
The lm() function fits linear models using the formula interface y ~ x1 + x2 + .... The function returns a model object containing coefficients, residuals, fitted values, and statistical summaries.
# Load the built-in Motor Trend car dataset
data(mtcars)
# Simple linear regression: model mpg as a linear function of weight
model_simple <- lm(mpg ~ wt, data = mtcars)
# Display the fitted coefficients
coefs <- coef(model_simple)
print(coefs)
# (Intercept) wt
# 37.285126 -5.344472
# Comprehensive summary: R-squared, F-statistic, p-values, standard errors
summary(model_simple)
The summary output provides R-squared values, F-statistics, p-values, and coefficient standard errors. Extract specific elements using accessor functions:
# Store the summary once, then pull out specific pieces
model_summary <- summary(model_simple)
model_summary$r.squared # 0.7528
# Residuals and fitted values via accessor functions
residuals(model_simple)
fitted(model_simple)
# 95% confidence intervals for the coefficient estimates
confint(model_simple, level = 0.95)
Multiple Regression and Variable Selection
Multiple regression includes several predictors. R handles multicollinearity detection through variance inflation factors (VIF).
# Multiple regression: three continuous predictors of fuel economy
model_multi <- lm(mpg ~ wt + hp + cyl, data = mtcars)
summary(model_multi)
# Check for multicollinearity
# Variance inflation factors; common rules of thumb flag VIF > 5 or > 10
library(car)
vif(model_multi)
# wt hp cyl
# 4.868814 2.894373 5.410091
# Standardized coefficients for comparison
# lm.beta rescales estimates so predictors on different units are comparable
library(lm.beta)
lm.beta(model_multi)
Compare nested models using ANOVA:
# Build a sequence of nested models, each adding one predictor to the last
model1 <- lm(mpg ~ wt, data = mtcars)
model2 <- update(model1, . ~ . + hp)
model3 <- update(model2, . ~ . + cyl)
# Sequential F-tests: does each added predictor significantly improve fit?
anova(model1, model2, model3)
Automated variable selection using stepwise regression:
# Backward elimination
# Start from the full model (all other mtcars columns as predictors) and
# iteratively drop the term whose removal most improves AIC
full_model <- lm(mpg ~ ., data = mtcars)
step_model <- step(full_model, direction = "backward")
# AIC-based selection
# stepAIC (MASS) searches in both directions: terms may be added or removed
library(MASS)
stepAIC(full_model, direction = "both")
Categorical Variables and Interactions
R automatically converts factor variables into dummy variables using treatment contrasts by default.
# Convert to factor so R treats cylinder count as categorical, not numeric
mtcars$cyl_factor <- factor(mtcars$cyl)
# Regression with categorical variable
# R expands the factor into treatment-contrast dummies (first level = baseline)
model_cat <- lm(mpg ~ wt + cyl_factor, data = mtcars)
summary(model_cat)
# Change reference level
# Make 8-cylinder cars the baseline category for the dummy coding
mtcars$cyl_factor <- relevel(mtcars$cyl_factor, ref = "8")
model_cat_relevel <- lm(mpg ~ wt + cyl_factor, data = mtcars)
# Interaction terms
model_interact <- lm(mpg ~ wt * cyl_factor, data = mtcars)
# Equivalent to: mpg ~ wt + cyl_factor + wt:cyl_factor
# Interaction between continuous variables
model_cont_interact <- lm(mpg ~ wt * hp, data = mtcars)
Visualize interactions using effect plots:
# Effect plots for every term in the interaction model
library(effects)
plot(allEffects(model_interact))
# Manual interaction plot
library(ggplot2)
# Note: mtcars$cyl_factor already exists from the section above. Re-running
# factor(mtcars$cyl) here would silently rebuild it with default level order
# and discard the releveled reference category, so we use it as-is.
ggplot(mtcars, aes(x = wt, y = mpg, color = cyl_factor)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
Polynomial and Transformed Variables
Transform variables directly in the formula or create new columns for interpretability.
# Polynomial regression
model_poly <- lm(mpg ~ poly(wt, 2), data = mtcars)
# poly() creates orthogonal polynomials: terms are uncorrelated, which is
# numerically stable, but individual coefficients are hard to interpret
# Raw polynomials (easier interpretation)
model_raw_poly <- lm(mpg ~ wt + I(wt^2), data = mtcars)
# Logarithmic transformation
# Log-log form: the log(wt) coefficient reads as an elasticity
model_log <- lm(log(mpg) ~ log(wt) + hp, data = mtcars)
# Square root transformation
model_sqrt <- lm(mpg ~ sqrt(wt) + hp, data = mtcars)
The I() function treats expressions literally, preventing formula interpretation:
# Without I(), this would be interaction
# In formula syntax ^ means term crossing, so wt^2 collapses to just wt
model_wrong <- lm(mpg ~ wt^2, data = mtcars) # Just wt
# Correct: square the variable
# I() forces arithmetic evaluation instead of formula interpretation
model_correct <- lm(mpg ~ I(wt^2), data = mtcars)
# Multiple transformations
model_complex <- lm(mpg ~ wt + I(wt^2) + log(hp) + sqrt(qsec), data = mtcars)
Model Diagnostics
Check regression assumptions using diagnostic plots and statistical tests.
# Four standard diagnostic plots on a 2x2 grid.
# Save the previous graphics settings and restore them afterwards, rather
# than hard-coding a reset to c(1, 1) (which would clobber any layout the
# user had configured before this snippet).
old_par <- par(mfrow = c(2, 2))
plot(model_multi)
par(old_par)
The four plots show:
- Residuals vs Fitted: Check linearity and homoscedasticity
- Q-Q Plot: Check normality of residuals
- Scale-Location: Check homoscedasticity
- Residuals vs Leverage: Identify influential points
Statistical tests for assumptions:
# Normality of residuals
# Shapiro-Wilk test: H0 = residuals are normally distributed
shapiro.test(residuals(model_multi))
# Homoscedasticity (constant variance)
library(lmtest)
bptest(model_multi) # Breusch-Pagan test
# Autocorrelation (for time series)
dwtest(model_multi) # Durbin-Watson test
# Outliers and influential points
library(car)
outlierTest(model_multi)
influenceIndexPlot(model_multi)
# Cook's distance
# Flag observations above the common 4/n rule-of-thumb cutoff
cooks_d <- cooks.distance(model_multi)
influential <- cooks_d > 4 / nrow(mtcars)
sum(influential)
Prediction and Confidence Intervals
Generate predictions with confidence and prediction intervals.
# Grid of new predictor values at which to generate predictions
newdata_grid <- data.frame(
  wt = c(2.5, 3.0, 3.5),
  hp = c(100, 150, 200),
  cyl = c(4, 6, 8)
)
# Point predictions only
predictions <- predict(model_multi, newdata = newdata_grid)
# Confidence interval: uncertainty in the *mean* response
conf_intervals <- predict(
  model_multi,
  newdata = newdata_grid,
  interval = "confidence",
  level = 0.95
)
# Prediction interval: uncertainty for an *individual* observation (wider)
pred_intervals <- predict(
  model_multi,
  newdata = newdata_grid,
  interval = "prediction",
  level = 0.95
)
print(pred_intervals)
# fit lwr upr
# 1 24.7332 18.9876 30.4788
# 2 21.5487 15.8234 27.2740
# 3 18.3642 12.6398 24.0886
Visualize predictions:
# Scatter plot with the fitted regression line overlaid
plot(mtcars$wt, mtcars$mpg, xlab = "Weight", ylab = "MPG")
abline(model_simple, col = "red")
# Dashed prediction-interval bands across the observed weight range
grid_wt <- seq(min(mtcars$wt), max(mtcars$wt), length.out = 100)
band <- predict(model_simple,
                newdata = data.frame(wt = grid_wt),
                interval = "prediction")
# matlines draws both the lwr and upr columns in a single call
matlines(grid_wt, band[, c("lwr", "upr")], col = "blue", lty = 2)
Robust and Weighted Regression
Handle outliers and heteroscedasticity using robust methods.
# Weighted least squares: weight each observation by the inverse of its
# estimated error variance, estimated via an auxiliary regression of the
# absolute residuals on the fitted values.
# Named wls_weights (not `weights`) to avoid masking stats::weights().
wls_weights <- 1 / fitted(lm(abs(residuals(model_multi)) ~ fitted(model_multi)))^2
model_wls <- lm(mpg ~ wt + hp + cyl, data = mtcars, weights = wls_weights)
# Robust regression (resistant to outliers) via M-estimation
library(MASS)
model_robust <- rlm(mpg ~ wt + hp + cyl, data = mtcars)
summary(model_robust)
# Side-by-side comparison of OLS vs robust coefficient estimates
data.frame(
OLS = coef(model_multi),
Robust = coef(model_robust)
)
Handle missing data:
# Remove observations with missing values (default)
model_na_omit <- lm(mpg ~ wt + hp, data = mtcars, na.action = na.omit)
# Exclude specific observations
model_subset <- lm(mpg ~ wt + hp, data = mtcars, subset = (cyl != 8))
# Model without influential points.
# Use a logical keep-mask rather than negative indexing: with
# mtcars[-which(...), ], an EMPTY index (no influential points) would
# select zero rows instead of all rows, breaking the refit.
keep_rows <- cooks_d <= 4 / nrow(mtcars)
model_clean <- lm(mpg ~ wt + hp + cyl, data = mtcars[keep_rows, ])
The lm() function provides a complete framework for linear regression analysis in R. Proper model specification, diagnostic checking, and validation ensure reliable results for prediction and inference tasks.