R - Linear Regression (lm)
The `lm()` function fits linear models using the formula interface `y ~ x1 + x2 + ...`. The function returns a model object containing coefficients, residuals, fitted values, and statistical summaries.
Key Insights
- The `lm()` function in R implements ordinary least squares regression with automatic handling of categorical variables, interaction terms, and polynomial features through formula syntax
- Model diagnostics require checking four key assumptions: linearity, independence, homoscedasticity, and normality of residuals using built-in plotting functions
- R’s formula interface supports complex model specifications including transformations, interactions, and nested terms without manual matrix construction
Basic Linear Regression Syntax
The lm() function fits linear models using the formula interface y ~ x1 + x2 + .... The function returns a model object containing coefficients, residuals, fitted values, and statistical summaries.
# Load the built-in Motor Trend car dataset
data(mtcars)
# Simple linear regression: model mpg as a linear function of weight
model_simple <- lm(mpg ~ wt, data = mtcars)
# Display the fitted coefficients
coefs <- coef(model_simple)
print(coefs)
# (Intercept) wt
# 37.285126 -5.344472
# Comprehensive summary: R-squared, F-statistic, p-values, standard errors
summary(model_simple)
The summary output provides R-squared values, F-statistics, p-values, and coefficient standard errors. Extract specific elements using accessor functions:
# Store the summary once, then pull out specific pieces
model_summary <- summary(model_simple)
model_summary$r.squared # 0.7528
# Residuals and fitted values via accessor functions
residuals(model_simple)
fitted(model_simple)
# 95% confidence intervals for the coefficient estimates
confint(model_simple, level = 0.95)
Multiple Regression and Variable Selection
Multiple regression includes several predictors. R handles multicollinearity detection through variance inflation factors (VIF).
# Multiple regression: three continuous predictors of fuel economy
model_multi <- lm(mpg ~ wt + hp + cyl, data = mtcars)
summary(model_multi)
# Check for multicollinearity
# Variance inflation factors; common rules of thumb flag VIF > 5 or > 10
library(car)
vif(model_multi)
# wt hp cyl
# 4.868814 2.894373 5.410091
# Standardized coefficients for comparison
# lm.beta rescales estimates so predictors on different units are comparable
library(lm.beta)
lm.beta(model_multi)
Compare nested models using ANOVA:
# Build a sequence of nested models, each adding one predictor to the last
model1 <- lm(mpg ~ wt, data = mtcars)
model2 <- update(model1, . ~ . + hp)
model3 <- update(model2, . ~ . + cyl)
# Sequential F-tests: does each added predictor significantly improve fit?
anova(model1, model2, model3)
Automated variable selection using stepwise regression:
# Backward elimination
# Start from the full model (all other mtcars columns as predictors) and
# iteratively drop the term whose removal most improves AIC
full_model <- lm(mpg ~ ., data = mtcars)
step_model <- step(full_model, direction = "backward")
# AIC-based selection
# stepAIC (MASS) searches in both directions: terms may be added or removed
library(MASS)
stepAIC(full_model, direction = "both")
Categorical Variables and Interactions
R automatically converts factor variables into dummy variables using treatment contrasts by default.
# Convert to factor so R treats cylinder count as categorical, not numeric
mtcars$cyl_factor <- factor(mtcars$cyl)
# Regression with categorical variable
# R expands the factor into treatment-contrast dummies (first level = baseline)
model_cat <- lm(mpg ~ wt + cyl_factor, data = mtcars)
summary(model_cat)
# Change reference level
# Make 8-cylinder cars the baseline category for the dummy coding
mtcars$cyl_factor <- relevel(mtcars$cyl_factor, ref = "8")
model_cat_relevel <- lm(mpg ~ wt + cyl_factor, data = mtcars)
# Interaction terms
model_interact <- lm(mpg ~ wt * cyl_factor, data = mtcars)
# Equivalent to: mpg ~ wt + cyl_factor + wt:cyl_factor
# Interaction between continuous variables
model_cont_interact <- lm(mpg ~ wt * hp, data = mtcars)
Visualize interactions using effect plots:
# Effect plots for every term in the interaction model
library(effects)
plot(allEffects(model_interact))
# Manual interaction plot
library(ggplot2)
# Note: mtcars$cyl_factor already exists from the section above. Re-running
# factor(mtcars$cyl) here would silently rebuild it with default level order
# and discard the releveled reference category, so we use it as-is.
ggplot(mtcars, aes(x = wt, y = mpg, color = cyl_factor)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
Polynomial and Transformed Variables
Transform variables directly in the formula or create new columns for interpretability.
# Polynomial regression
model_poly <- lm(mpg ~ poly(wt, 2), data = mtcars)
# poly() creates orthogonal polynomials: terms are uncorrelated, which is
# numerically stable, but individual coefficients are hard to interpret
# Raw polynomials (easier interpretation)
model_raw_poly <- lm(mpg ~ wt + I(wt^2), data = mtcars)
# Logarithmic transformation
# Log-log form: the log(wt) coefficient reads as an elasticity
model_log <- lm(log(mpg) ~ log(wt) + hp, data = mtcars)
# Square root transformation
model_sqrt <- lm(mpg ~ sqrt(wt) + hp, data = mtcars)
The I() function treats expressions literally, preventing formula interpretation:
# Without I(), this would be interaction
# In formula syntax ^ means term crossing, so wt^2 collapses to just wt
model_wrong <- lm(mpg ~ wt^2, data = mtcars) # Just wt
# Correct: square the variable
# I() forces arithmetic evaluation instead of formula interpretation
model_correct <- lm(mpg ~ I(wt^2), data = mtcars)
# Multiple transformations
model_complex <- lm(mpg ~ wt + I(wt^2) + log(hp) + sqrt(qsec), data = mtcars)
Model Diagnostics
Check regression assumptions using diagnostic plots and statistical tests.
# Four standard diagnostic plots on a 2x2 grid.
# Save the previous graphics settings and restore them afterwards, rather
# than hard-coding a reset to c(1, 1) (which would clobber any layout the
# user had configured before this snippet).
old_par <- par(mfrow = c(2, 2))
plot(model_multi)
par(old_par)
The four plots show:
- Residuals vs Fitted: Check linearity and homoscedasticity
- Q-Q Plot: Check normality of residuals
- Scale-Location: Check homoscedasticity
- Residuals vs Leverage: Identify influential points
Statistical tests for assumptions:
# Normality of residuals
# Shapiro-Wilk test: H0 = residuals are normally distributed
shapiro.test(residuals(model_multi))
# Homoscedasticity (constant variance)
library(lmtest)
bptest(model_multi) # Breusch-Pagan test
# Autocorrelation (for time series)
dwtest(model_multi) # Durbin-Watson test
# Outliers and influential points
library(car)
outlierTest(model_multi)
influenceIndexPlot(model_multi)
# Cook's distance
# Flag observations above the common 4/n rule-of-thumb cutoff
cooks_d <- cooks.distance(model_multi)
influential <- cooks_d > 4 / nrow(mtcars)
sum(influential)
Prediction and Confidence Intervals
Generate predictions with confidence and prediction intervals.
# Grid of new predictor values at which to generate predictions
newdata_grid <- data.frame(
  wt = c(2.5, 3.0, 3.5),
  hp = c(100, 150, 200),
  cyl = c(4, 6, 8)
)
# Point predictions only
predictions <- predict(model_multi, newdata = newdata_grid)
# Confidence interval: uncertainty in the *mean* response
conf_intervals <- predict(
  model_multi,
  newdata = newdata_grid,
  interval = "confidence",
  level = 0.95
)
# Prediction interval: uncertainty for an *individual* observation (wider)
pred_intervals <- predict(
  model_multi,
  newdata = newdata_grid,
  interval = "prediction",
  level = 0.95
)
print(pred_intervals)
# fit lwr upr
# 1 24.7332 18.9876 30.4788
# 2 21.5487 15.8234 27.2740
# 3 18.3642 12.6398 24.0886
Visualize predictions:
# Scatter plot with the fitted regression line overlaid
plot(mtcars$wt, mtcars$mpg, xlab = "Weight", ylab = "MPG")
abline(model_simple, col = "red")
# Dashed prediction-interval bands across the observed weight range
grid_wt <- seq(min(mtcars$wt), max(mtcars$wt), length.out = 100)
band <- predict(model_simple,
                newdata = data.frame(wt = grid_wt),
                interval = "prediction")
# matlines draws both the lwr and upr columns in a single call
matlines(grid_wt, band[, c("lwr", "upr")], col = "blue", lty = 2)
Robust and Weighted Regression
Handle outliers and heteroscedasticity using robust methods.
# Weighted least squares: weight each observation by the inverse of its
# estimated error variance, estimated via an auxiliary regression of the
# absolute residuals on the fitted values.
# Named wls_weights (not `weights`) to avoid masking stats::weights().
wls_weights <- 1 / fitted(lm(abs(residuals(model_multi)) ~ fitted(model_multi)))^2
model_wls <- lm(mpg ~ wt + hp + cyl, data = mtcars, weights = wls_weights)
# Robust regression (resistant to outliers) via M-estimation
library(MASS)
model_robust <- rlm(mpg ~ wt + hp + cyl, data = mtcars)
summary(model_robust)
# Side-by-side comparison of OLS vs robust coefficient estimates
data.frame(
OLS = coef(model_multi),
Robust = coef(model_robust)
)
Handle missing data:
# Remove observations with missing values (default)
model_na_omit <- lm(mpg ~ wt + hp, data = mtcars, na.action = na.omit)
# Exclude specific observations
model_subset <- lm(mpg ~ wt + hp, data = mtcars, subset = (cyl != 8))
# Model without influential points.
# Use a logical keep-mask rather than negative indexing: with
# mtcars[-which(...), ], an EMPTY index (no influential points) would
# select zero rows instead of all rows, breaking the refit.
keep_rows <- cooks_d <= 4 / nrow(mtcars)
model_clean <- lm(mpg ~ wt + hp + cyl, data = mtcars[keep_rows, ])
The lm() function provides a complete framework for linear regression analysis in R. Proper model specification, diagnostic checking, and validation ensure reliable results for prediction and inference tasks.