Purpose
DB - Fox - Regression. Inside an lm() formula the operators + - * / ^ have special meanings that differ from ordinary arithmetic. Simulate a multiple regression and fit it with each operator in turn, starting with +.

> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] + indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3])
Residuals:
     Min       1Q   Median       3Q      Max 
-5.69660 -1.10317  0.02548  1.11955  5.30918 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.9875     0.1014   19.60   <2e-16 ***
indep[, 2]    2.8206     0.1320   21.37   <2e-16 ***
indep[, 3]    4.1764     0.1340   31.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.709 on 1974 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF,  p-value: < 2.2e-16
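
In a formula, + just adds a term to the model; it does not add the two columns together. As a minimal sketch (not part of the session above, reusing the dep and indep objects already defined), the arithmetic sum would need I():

# adding a term vs. adding the columns (sketch, not run)
fit.terms <- lm(dep ~ indep[, 2] + indep[, 3])      # two slopes, as in the output above
fit.sum   <- lm(dep ~ I(indep[, 2] + indep[, 3]))   # one slope for the column sum
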
> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] - indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] - indep[, 3])
Residuals:
     Min       1Q   Median       3Q      Max 
-7.19413 -1.43267 -0.07039  1.44780  6.05450 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.07416    0.09307   43.78   <2e-16 ***
indep[, 2]   2.81388    0.16121   17.45   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.087 on 1975 degrees of freedom
Multiple R-squared: 0.1336,  Adjusted R-squared: 0.1332
F-statistic: 304.7 on 1 and 1975 DF,  p-value: < 2.2e-16
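
In a formula, - removes a term. Since indep[, 3] was never added here, dep ~ indep[, 2] - indep[, 3] reduces to a regression on indep[, 2] alone, which is why only that slope appears above. A small check (a sketch reusing the objects above; the names are new):

# - drops a term; here the result is the same as omitting indep[, 3] entirely (sketch)
fit.minus <- lm(dep ~ indep[, 2] - indep[, 3])
fit.x2    <- lm(dep ~ indep[, 2])
all.equal(coef(fit.minus), coef(fit.x2))   # expected to be TRUE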

> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] * indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] * indep[, 3])
Residuals:
     Min       1Q   Median       3Q      Max 
-5.69500 -1.10398  0.02508  1.12103  5.30697 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            1.99090    0.15574  12.783   <2e-16 ***
indep[, 2]             2.81386    0.26792  10.502   <2e-16 ***
indep[, 3]             4.16967    0.26972  15.459   <2e-16 ***
indep[, 2]:indep[, 3]  0.01331    0.46198   0.029    0.977    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.709 on 1973 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4186
F-statistic: 475.2 on 3 and 1973 DF,  p-value: < 2.2e-16
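
In a formula, * crosses terms: a * b expands to a + b + a:b, which is why an interaction row appears in the table above. A sketch of the equivalence (reusing the objects above; the names are new):

# * is shorthand for both main effects plus their interaction (sketch)
fit.star <- lm(dep ~ indep[, 2] * indep[, 3])
fit.long <- lm(dep ~ indep[, 2] + indep[, 3] + indep[, 2]:indep[, 3])
all.equal(coef(fit.star), coef(fit.long))   # expected to be TRUE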

/

> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2]/indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2]/indep[, 3])
Residuals:
     Min       1Q   Median       3Q      Max 
-6.45644 -1.17190  0.04854  1.16028  5.65789 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)            4.09045    0.08069  50.694   <2e-16 ***
indep[, 2]            -0.31556    0.18579  -1.698   0.0896 .  
indep[, 2]:indep[, 3]  6.21142    0.24296  25.566   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.81 on 1974 degrees of freedom
Multiple R-squared: 0.3491,  Adjusted R-squared: 0.3485
F-statistic: 529.5 on 2 and 1974 DF,  p-value: < 2.2e-16
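
In a formula, / denotes nesting: a / b expands to a + a:b, which matches the coefficient names above (no main effect for indep[, 3]). A sketch of the equivalence (reusing the objects above; the names are new):

# / is shorthand for a main effect plus the nested interaction (sketch)
fit.nest <- lm(dep ~ indep[, 2] / indep[, 3])
fit.long <- lm(dep ~ indep[, 2] + indep[, 2]:indep[, 3])
all.equal(coef(fit.nest), coef(fit.long))   # expected to be TRUE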

^

> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ I(indep[, 2]^2))
> summary(fit)
Call:
lm(formula = dep ~ I(indep[, 2]^2))
Residuals:
     Min       1Q   Median       3Q      Max 
-7.07126 -1.46190 -0.08996  1.45188  6.24808 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.57679    0.07029   65.11   <2e-16 ***
I(indep[, 2]^2)  2.70063    0.15678   17.23   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.091 on 1975 degrees of freedom
Multiple R-squared: 0.1306,  Adjusted R-squared: 0.1302
F-statistic: 296.7 on 1 and 1975 DF,  p-value: < 2.2e-16
> fit <- lm(dep ~ (indep[, 2]^2))
> summary(fit)
Call:
lm(formula = dep ~ (indep[, 2]^2))
Residuals:
     Min       1Q   Median       3Q      Max 
-7.19413 -1.43267 -0.07039  1.44780  6.05450 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.07416    0.09307   43.78   <2e-16 ***
indep[, 2]   2.81388    0.16121   17.45   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.087 on 1975 degrees of freedom
Multiple R-squared: 0.1336,  Adjusted R-squared: 0.1332
F-statistic: 304.7 on 1 and 1975 DF,  p-value: < 2.2e-16
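
In a formula, ^ is crossing to a power, not exponentiation, so (indep[, 2]^2) collapses to plain indep[, 2] (second output above), while I(indep[, 2]^2) really squares the values (first output). A sketch of the two readings (reusing the objects above, not run):

# ^ works on formula terms; I() does the arithmetic (sketch)
lm(dep ~ (indep[, 2] + indep[, 3])^2)   # same terms as indep[, 2] * indep[, 3]
lm(dep ~ I(indep[, 2]^2))               # regression on the squared predictor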

To remove a few observations from the old model

> n <- 1977
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> fit <- lm(dep ~ indep[, 2] + indep[, 3])
> summary(fit)
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3])
Residuals:
     Min       1Q   Median       3Q      Max 
-5.69660 -1.10317  0.02548  1.11955  5.30918 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.9875     0.1014   19.60   <2e-16 ***
indep[, 2]    2.8206     0.1320   21.37   <2e-16 ***
indep[, 3]    4.1764     0.1340   31.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.709 on 1974 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF,  p-value: < 2.2e-16
> summary(update(fit, subset = -10))
Call:
lm(formula = dep ~ indep[, 2] + indep[, 3], subset = -10)
Residuals:
     Min       1Q   Median       3Q      Max 
-5.69632 -1.10368  0.02294  1.12023  5.31033 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.9861     0.1015   19.57   <2e-16 ***
indep[, 2]    2.8219     0.1321   21.37   <2e-16 ***
indep[, 3]    4.1772     0.1340   31.17   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.709 on 1973 degrees of freedom
Multiple R-squared: 0.4194,  Adjusted R-squared: 0.4188
F-statistic: 712.7 on 2 and 1973 DF,  p-value: < 2.2e-16
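
update() refits the model with only the changed pieces respecified; subset = -10 drops the 10th observation. The same idea extends to several observations (a sketch with hypothetical indices, not run above):

# drop observations 1, 5 and 10 from the existing fit (sketch)
summary(update(fit, subset = -c(1, 5, 10)))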

Want to compute standardized regression coefficients (scale the variables first, then regress).

> z <- cbind(dep, indep)
> z <- scale(z)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4])
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4])
Residuals:
     Min       1Q   Median       3Q      Max 
-2.54100 -0.49208  0.01136  0.49938  2.36819 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept) 9.868e-19  1.714e-02 5.76e-17        1    
z[, 3]      3.664e-01  1.715e-02    21.37   <2e-16 ***
z[, 4]      5.346e-01  1.715e-02    31.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7623 on 1974 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4189
F-statistic: 713.2 on 2 and 1974 DF,  p-value: < 2.2e-16
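
The same standardized slopes can be recovered from the unscaled fit by rescaling each coefficient by sd(x)/sd(y). A sketch (fit.raw is a name introduced here, not in the session):

# standardized slope = raw slope * sd(x) / sd(y)  (sketch)
fit.raw <- lm(dep ~ indep[, 2] + indep[, 3])
coef(fit.raw)[-1] * apply(indep[, 2:3], 2, sd) / sd(dep[, 1])
# should reproduce the z[, 3] and z[, 4] estimates above, up to rounding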

Want to remove the intercept from the model.

> z <- cbind(dep, indep)
> z <- scale(z)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] - 1)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] - 1)
Residuals:
     Min       1Q   Median       3Q      Max 
-2.54100 -0.49208  0.01136  0.49938  2.36819 

Coefficients:
       Estimate Std. Error t value Pr(>|t|)    
z[, 3]  0.36644    0.01714   21.37   <2e-16 ***
z[, 4]  0.53463    0.01714   31.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7621 on 1975 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4189
F-statistic: 713.5 on 2 and 1975 DF,  p-value: < 2.2e-16
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + 0)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + 0)
Residuals:
     Min       1Q   Median       3Q      Max 
-2.54100 -0.49208  0.01136  0.49938  2.36819 

Coefficients:
       Estimate Std. Error t value Pr(>|t|)    
z[, 3]  0.36644    0.01714   21.37   <2e-16 ***
z[, 4]  0.53463    0.01714   31.18   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7621 on 1975 degrees of freedom
Multiple R-squared: 0.4195,  Adjusted R-squared: 0.4189
F-statistic: 713.5 on 2 and 1975 DF,  p-value: < 2.2e-16

Change the dummy variable coding

> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> contrasts(z$fac)
  b c d
a 0 0 0
b 1 0 0
c 0 1 0
d 0 0 1

Change the base level to the second level, b:

> contrasts(z$fac) <- contr.treatment(levels(z$fac), base = 2)
> contrasts(z$fac)
  a c d
a 1 0 0
b 0 0 0
c 0 1 0
d 0 0 1
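
relevel() is an alternative way to pick the reference level: it moves the chosen level to the front, so the default treatment contrasts use it as the base. A sketch (fac2 is a new name, not part of the session; the comment describes the default contrasts after releveling):

# make "b" the reference level without setting contrasts explicitly (sketch)
fac2 <- relevel(z$fac, ref = "b")
contrasts(fac2)   # "b" row is all zeros; a, c, d get the dummy columns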

Run a regression with factor level a as the base

> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + z$fac)
Residuals:
     Min       1Q   Median       3Q      Max 
-5.75244 -1.15317 -0.03433  1.18877  5.13664 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.14092    0.11970  17.886   <2e-16 ***
z[, 3]       2.99510    0.13357  22.423   <2e-16 ***
z[, 4]       3.87625    0.13317  29.107   <2e-16 ***
z$facb      -0.03152    0.10841  -0.291    0.771    
z$facc      -0.01576    0.10838  -0.145    0.884    
z$facd      -0.09286    0.10838  -0.857    0.392    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.713 on 1994 degrees of freedom
Multiple R-squared: 0.4063,  Adjusted R-squared: 0.4048
F-statistic: 272.9 on 5 and 1994 DF,  p-value: < 2.2e-16

Run a regression with factor level b as the base

> n <- 2000
> set.seed(n)
> error.sd <- 3
> error <- sqrt(error.sd) * rnorm(n)
> true.beta <- rbind(2, 3, 4)
> indep <- cbind(rep(1, n), runif(n), runif(n))
> dep <- indep %*% true.beta + error
> z <- as.data.frame(cbind(dep, indep))
> z$fac <- factor(rep(letters[1:4], 500))
> contrasts(z$fac) <- contr.treatment(levels(z$fac), base = 2)
> fit <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> summary(fit)
Call:
lm(formula = z[, 1] ~ z[, 3] + z[, 4] + z$fac)
Residuals:
     Min       1Q   Median       3Q      Max 
-5.75244 -1.15317 -0.03433  1.18877  5.13664 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2.10940    0.12157  17.351   <2e-16 ***
z[, 3]       2.99510    0.13357  22.423   <2e-16 ***
z[, 4]       3.87625    0.13317  29.107   <2e-16 ***
z$faca       0.03152    0.10841   0.291    0.771    
z$facc       0.01576    0.10840   0.145    0.884    
z$facd      -0.06135    0.10838  -0.566    0.571    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.713 on 1994 degrees of freedom
Multiple R-squared: 0.4063,  Adjusted R-squared: 0.4048
F-statistic: 272.9 on 5 and 1994 DF,  p-value: < 2.2e-16
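
Only the intercept and the factor dummies change between the two codings; the slopes, fitted values and overall fit statistics are identical. A quick check, assuming the two fits above were saved under the hypothetical names fit.base.a and fit.base.b:

# different dummy codings describe the same fitted model (sketch)
all.equal(fitted(fit.base.a), fitted(fit.base.b))   # expected to be TRUE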

Look at the ANOVA table for the model

> anova(fit)
Analysis of Variance Table
Response: z[, 1]
            Df Sum Sq Mean Sq  F value Pr(>F)    
z[, 3]       1 1516.2  1516.2 516.4704 <2e-16 ***
z[, 4]       1 2487.2  2487.2 847.2168 <2e-16 ***
z$fac        3    2.5     0.8   0.2813  0.839    
Residuals 1994 5853.8     2.9                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
A better approach is to compare the goodness of fit of the two nested models directly:
> fit1 <- lm(z[, 1] ~ z[, 3] + z[, 4])
> fit2 <- lm(z[, 1] ~ z[, 3] + z[, 4] + z$fac)
> anova(fit1, fit2)
Analysis of Variance Table
Model 1: z[, 1] ~ z[, 3] + z[, 4]
Model 2: z[, 1] ~ z[, 3] + z[, 4] + z$fac
  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1   1997 5856.3                           
2   1994 5853.8  3       2.5 0.2813  0.839
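
The F statistic in this comparison is the drop in RSS per extra parameter divided by the residual mean square of the larger model. Recomputing it by hand from the (rounded) table above:

# F = ((RSS1 - RSS2) / df) / (RSS2 / resid.df)   (sketch, rounded inputs)
((5856.3 - 5853.8) / 3) / (5853.8 / 1994)   # roughly 0.28, matching the 0.2813 printed above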

Basic hypothesis testing

> library(car)
> attach(Duncan)
The following object(s) are masked from Duncan (position 3):

    education income prestige type

The following object(s) are masked from Duncan (position 4):

    education income prestige type

The following object(s) are masked from package:robustbase:

    education

> fit <- lm(prestige ~ income + education)
> linear.hypothesis(fit, c(0, 1, -1))
Linear hypothesis test

Hypothesis:
income - education = 0

Model 1: prestige ~ income + education
Model 2: restricted model

  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1     42 7506.7                           
2     43 7518.9 -1     -12.2 0.0682 0.7952

> linear.hypothesis(fit, c(0, -1, 1))
Linear hypothesis test

Hypothesis:
-income + education = 0

Model 1: prestige ~ income + education
Model 2: restricted model

  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1     42 7506.7                           
2     43 7518.9 -1     -12.2 0.0682 0.7952

> fit <- lm(prestige ~ income)
> linear.hypothesis(fit, c(0, 1))
Linear hypothesis test

Hypothesis:
income = 0

Model 1: prestige ~ income
Model 2: restricted model

  Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
1     43 13023                                  
2     44 43688 -1    -30665 101.25 7.144e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

> linear.hypothesis(fit, diag(2), c(0, 1))
Linear hypothesis test

Hypothesis:
(Intercept) = 0
income = 1

Model 1: prestige ~ income
Model 2: restricted model

  Res.Df     RSS Df Sum of Sq      F  Pr(>F)  
1     43 13022.8                              
2     45 14718.0 -2   -1695.2 2.7987 0.07201 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
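
In more recent versions of car the function is called linearHypothesis() (linear.hypothesis is the older name), and the hypothesis can be written as a character string. A sketch of the first test in that style (assuming a current car installation):

# the income = education test with the newer car interface (sketch)
library(car)
fit <- lm(prestige ~ income + education, data = Duncan)
linearHypothesis(fit, "income = education")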

Plot data ellipses for two normally distributed samples

> data.ellipse(rnorm(1000), rnorm(1000), levels = c(0.5, 0.75,
+     0.9, 0.95))

DB_May5_2010-016.jpg

Plot data ellipses for a normal and a uniform sample

> data.ellipse(rnorm(1000), runif(1000), levels = c(0.5, 0.75,
+     0.9, 0.95))

DB_May5_2010-017.jpg

Confidence ellipses for the coefficient estimates

> fit <- lm(prestige ~ income + education)
> confidence.ellipse(fit, levels = c(0.5, 0.75, 0.9, 0.95))

DB_May5_2010-018.jpg

Arguments for lm

> args(lm)
function (formula, data, subset, weights, na.action, method = "qr",
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
    contrasts = NULL, offset, ...)
NULL
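
Most of these arguments are rarely needed; data, subset and weights are the common ones. A hypothetical example of passing data and weights explicitly (unit weights just reproduce ordinary least squares):

# explicit data= and weights= arguments (sketch; unit weights = plain OLS)
fit.w <- lm(prestige ~ income + education, data = Duncan,
            weights = rep(1, nrow(Duncan)))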

Use of the identity function I() inside a formula

> fit <- lm(prestige ~ I(income + education))
> summary(fit)
Call:
lm(formula = prestige ~ I(income + education))
Residuals:
     Min       1Q   Median       3Q      Max 
-29.6049  -6.7078   0.1235   6.9794  33.2895 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -6.06319    4.22540  -1.435    0.159    
I(income + education)  0.56927    0.03958  14.382   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.22 on 43 degrees of freedom
Multiple R-squared: 0.8279,  Adjusted R-squared: 0.8239
F-statistic: 206.8 on 1 and 43 DF,  p-value: < 2.2e-16
> fit <- lm(prestige ~ I(income * education))
> summary(fit)
Call:
lm(formula = prestige ~ I(income * education))
Residuals:
    Min      1Q  Median      3Q     Max 
-30.559 -10.845  -1.661   5.826  49.967 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           1.728e+01  3.425e+00   5.044 8.75e-06 ***
I(income * education) 1.120e-02  9.385e-04  11.934 3.10e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.35 on 43 degrees of freedom
Multiple R-squared: 0.7681,  Adjusted R-squared: 0.7627
F-statistic: 142.4 on 1 and 43 DF,  p-value: 3.102e-15
> fit <- lm(prestige ~ (income * education))
> summary(fit)
Call:
lm(formula = prestige ~ (income * education))
Residuals:
     Min       1Q   Median       3Q      Max 
-29.1898  -6.2758   0.4532   7.0933  32.7585 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)   
(Intercept)      -8.250934   7.378546  -1.118  0.26998   
income            0.655621   0.197148   3.326  0.00187 **
education         0.606030   0.192363   3.150  0.00304 **
income:education -0.001237   0.003386  -0.365  0.71673   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.51 on 41 degrees of freedom
Multiple R-squared: 0.8287,  Adjusted R-squared: 0.8162
F-statistic: 66.13 on 3 and 41 DF,  p-value: 9.286e-16
> fit <- lm(prestige ~ (income:education))
> summary(fit)
Call:
lm(formula = prestige ~ (income:education))
Residuals:
    Min      1Q  Median      3Q     Max 
-30.559 -10.845  -1.661   5.826  49.967 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1.728e+01  3.425e+00   5.044 8.75e-06 ***
income:education 1.120e-02  9.385e-04  11.934 3.10e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.35 on 43 degrees of freedom
Multiple R-squared: 0.7681,  Adjusted R-squared: 0.7627
F-statistic: 142.4 on 1 and 43 DF,  p-value: 3.102e-15