I am using R Studio for class examples which is more user friendly. The computation core is still R. It is okay if you want to use and familiar with others, but I may not provide full assistance. R can be downloaded from https://www.r-project.org/. RStudio is from https://rstudio.com/.
# setwd("G:/Desktop/rsite/stat464") # set working directory
# install.packages("MPV")
library(MPV) # data in the appendix
## Loading required package: lattice
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
# install.packages("scatterplot3d") # Install
library("scatterplot3d") # load
# install.packages("installr")
library(installr)
##
## Welcome to installr version 0.23.2
##
## More information is available on the installr project website:
## https://github.com/talgalili/installr/
##
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/installr/issues
##
## To suppress this message use:
## suppressPackageStartupMessages(library(installr))
#install.packages("xlsx")
library(readxl)
p2.7
## purity hydro
## 1 86.91 1.02
## 2 89.85 1.11
## 3 90.28 1.43
## 4 86.34 1.11
## 5 92.58 1.01
## 6 87.33 0.95
## 7 86.29 1.11
## 8 91.86 0.87
## 9 95.61 1.43
## 10 89.86 1.02
## 11 96.73 1.46
## 12 99.42 1.55
## 13 98.66 1.55
## 14 96.07 1.55
## 15 93.65 1.40
## 16 87.31 1.15
## 17 95.00 1.01
## 18 96.85 0.99
## 19 85.20 0.95
## 20 90.56 0.98
head(p2.7)
## purity hydro
## 1 86.91 1.02
## 2 89.85 1.11
## 3 90.28 1.43
## 4 86.34 1.11
## 5 92.58 1.01
## 6 87.33 0.95
example21 = read_excel("data/Chapter 2/Examples/data-ex-2-1 (Rocket Prop).xls", col_names =T) # read .xls file from local directory
colnames(example21) = c("Observation","Shear_Strength_psi","Age_of_Propellant_weeks")
example21
## # A tibble: 20 × 3
## Observation Shear_Strength_psi Age_of_Propellant_weeks
## <dbl> <dbl> <dbl>
## 1 1 2159. 15.5
## 2 2 1678. 23.8
## 3 3 2316 8
## 4 4 2061. 17
## 5 5 2208. 5.5
## 6 6 1708. 19
## 7 7 1785. 24
## 8 8 2575 2.5
## 9 9 2358. 7.5
## 10 10 2257. 11
## 11 11 2165. 13
## 12 12 2400. 3.75
## 13 13 1780. 25
## 14 14 2337. 9.75
## 15 15 1765. 22
## 16 16 2054. 18
## 17 17 2414. 6
## 18 18 2200. 12.5
## 19 19 2654. 2
## 20 20 1754. 21.5
head(example21)
## # A tibble: 6 × 3
## Observation Shear_Strength_psi Age_of_Propellant_weeks
## <dbl> <dbl> <dbl>
## 1 1 2159. 15.5
## 2 2 1678. 23.8
## 3 3 2316 8
## 4 4 2061. 17
## 5 5 2208. 5.5
## 6 6 1708. 19
attach(example21)
y = example21$Shear_Strength_psi
x = example21$Age_of_Propellant_weeks
n = length(x)
plot(Age_of_Propellant_weeks,Shear_Strength_psi,pch = 16)
want to study the linear relationship between the age of propellant and sheer strength
can draw a picture of it and assume linear model is a good model
Our model is \[ Y = \beta_0+\beta_1 X+\epsilon. \]
Here, \(Y\) is Shear_Strength_psi and \(X\) is Age_of_Propellant_weeks. We can compute what we want directly from definitaions \[ S_{xx} = \sum_{i=1}^n(x_i-\bar{x})^2\\ S_{xy} = \sum_{i = 1}^ny_i(x_i-\bar{x})=\sum_{i=1}^nx_iy_i-\bar{x}\sum_{i=1}^ny_i\\ \hat{\beta}_1 = \frac{S_{xy}}{S_{xx}}\\ \hat{\beta}_0 = \bar{y}-\hat{\beta}_1\bar{x}. \]
sxx = sum(x^2)-sum(x)^2/n;sxx
## [1] 1106.559
sxy = sum(x*y)-sum(x)*sum(y)/n;sxy
## [1] -41112.65
beta1 = sxy/sxx;beta1
## [1] -37.15359
beta0 = mean(y)-beta1*mean(x)
c(beta0,beta1)
## [1] 2627.82236 -37.15359
error = y-(beta0+beta1*x)
plot(error);hist(error)
We can use R packages to do most of the work in the future. Most softwares have packages for linear model. In lm function, we have everything we want in the output summary.
# first state modle (y response ~ x, data)
# when there are multiple will see + sign with multiple responses
model1 = lm(Shear_Strength_psi ~ Age_of_Propellant_weeks,data = example21)
summary(model1)
##
## Call:
## lm(formula = Shear_Strength_psi ~ Age_of_Propellant_weeks, data = example21)
##
## Residuals:
## Min 1Q Median 3Q Max
## -215.98 -50.68 28.74 66.61 106.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2627.822 44.184 59.48 < 2e-16 ***
## Age_of_Propellant_weeks -37.154 2.889 -12.86 1.64e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.11 on 18 degrees of freedom
## Multiple R-squared: 0.9018, Adjusted R-squared: 0.8964
## F-statistic: 165.4 on 1 and 18 DF, p-value: 1.643e-10
will use lm() function a lot
what we care about in the summary (right now) are the Coefficients
for coefficients we have intercept (beta 0) and what we define as out slope (beta 1)
response is decreasing with respect to our ???
job of statistician is to prove that this slope and intercept are indeed true and not just two random numbers
F-test tells us if the whole model works good
For example, we know the variance of slope \(\beta_0\) is \[Var(\hat{\beta_1})=\frac{\sigma^2}{S_{xx}}\]. The test statistic of \(\beta_0\) is \[ t_0 = \frac{\hat{\beta_0}-\beta_{00}}{\sqrt{MS_{Res}(1/n+\bar{x}^2/S_{xx})}}. \] p-value is given by \[P(T>t_0).\] Let’s do it by hand.
sst = sum((y-mean(y))^2);sst
## [1] 1693738
ssres = sum(error^2);ssres
## [1] 166254.9
varBeta1 = (ssres/(n-2))/sxx;sqrt(varBeta1) # variance of \beta_1
## [1] 2.889107
sdOfBeta1 = sqrt(ssres/(n-2)*(1/sxx))
sdOfBeta0 = sqrt(ssres/(n-2)*(1/n+mean(x)^2/sxx))
t0 = (beta0-0)/sdOfBeta0;t0
## [1] 59.47464
pt(-t0,n-2)
## [1] 2.03178e-22
Similarly, lm returns everything else as we want. (We will meet Adjusted R-squared later).
Let \(\alpha = 0.05\), the \((1-\alpha)100\%=95\%\) confidence interval (C.I.) of \(\beta_1\) is \((-43.22 -31.08)\), given by
c(beta1+qt(0.025,n-2)*sdOfBeta1,beta1-qt(0.025,n-2)*sdOfBeta1)
## [1] -43.22338 -31.08380
We predict the average response at point \(x_0\) with its corresponding confidence interval. For all \(x_i,i=1,...,n\), we can create a confidence band.
yhat = beta0+beta1*x
plot(x,y,pch=16)
lines(x,yhat)
x0 = 15;yhat0 = beta0+beta1*x0
ciPoint = c(yhat0-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(x0-mean(x))^2/sxx)),
yhat0+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(x0-mean(x))^2/sxx)))
points(x0,yhat0,pch=2,cex=2);points(c(x0,x0),ciPoint,pch = 8)
yhatSort = beta0+beta1*sort(x)
ciBand = cbind(yhatSort-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1/n+(sort(x)-mean(x))^2/sxx)),
yhatSort+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1/n+(sort(x)-mean(x))^2/sxx)))
ciPredBand = cbind(yhatSort-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(sort(x)-mean(x))^2/sxx)),
yhatSort+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(sort(x)-mean(x))^2/sxx)))
plot(x,y,pch=16)
lines(x,yhat,lwd = 2)
lines(sort(x),ciBand[,1],type = 'l',lty = 'dashed',lwd = 2)
lines(sort(x),ciBand[,2],type = 'l',lty = 'dashed',lwd = 2)
lines(sort(x),ciPredBand[,1],type = 'l',lty = 'dotted',lwd = 2)
lines(sort(x),ciPredBand[,2],type = 'l',lty = 'dotted',lwd = 2)
Of course, we have simple way.
confint(model1)
## 2.5 % 97.5 %
## (Intercept) 2534.99540 2720.6493
## Age_of_Propellant_weeks -43.22338 -31.0838
newData = data.frame(Age_of_Propellant_weeks = sort(Age_of_Propellant_weeks))
bandOfConf = predict(model1, newdata = newData, interval = 'confidence')# "confidence"
bandOfPred = predict(model1, newdata = newData, interval = 'prediction')# "confidence"
xOrdered = newData$Age_of_Propellant_weeks
plot(x,y)
abline(model1, col = "red")
lines(xOrdered,bandOfConf[,2],type = 'l',lty = 'dashed',lwd = 2)
lines(xOrdered,bandOfConf[,3],type = 'l',lty = 'dashed',lwd = 2)
lines(xOrdered,bandOfPred[,2],type = 'l',lty = 'dotted',lwd = 2)
lines(xOrdered,bandOfPred[,3],type = 'l',lty = 'dotted',lwd = 2)
If we want to estimate the line through the origin, we can do the following