Introduction of R

I am using R Studio for class examples which is more user friendly. The computation core is still R. It is okay if you want to use and familiar with others, but I may not provide full assistance. R can be downloaded from https://www.r-project.org/. RStudio is from https://rstudio.com/.

# setwd("G:/Desktop/rsite/stat464") # set working directory
# install.packages("MPV")
library(MPV) # data in the appendix
## Loading required package: lattice
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
# install.packages("scatterplot3d") # Install
library("scatterplot3d") # load
# install.packages("installr")
library(installr) 
## 
## Welcome to installr version 0.23.2
## 
## More information is available on the installr project website:
## https://github.com/talgalili/installr/
## 
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/installr/issues
## 
##          To suppress this message use:
##          suppressPackageStartupMessages(library(installr))
#install.packages("xlsx")
library(readxl)
p2.7
##    purity hydro
## 1   86.91  1.02
## 2   89.85  1.11
## 3   90.28  1.43
## 4   86.34  1.11
## 5   92.58  1.01
## 6   87.33  0.95
## 7   86.29  1.11
## 8   91.86  0.87
## 9   95.61  1.43
## 10  89.86  1.02
## 11  96.73  1.46
## 12  99.42  1.55
## 13  98.66  1.55
## 14  96.07  1.55
## 15  93.65  1.40
## 16  87.31  1.15
## 17  95.00  1.01
## 18  96.85  0.99
## 19  85.20  0.95
## 20  90.56  0.98
head(p2.7)
##   purity hydro
## 1  86.91  1.02
## 2  89.85  1.11
## 3  90.28  1.43
## 4  86.34  1.11
## 5  92.58  1.01
## 6  87.33  0.95

Simple Linear Regression

Regression and Inference

example21 = read_excel("data/Chapter 2/Examples/data-ex-2-1 (Rocket Prop).xls", col_names =T) # read .xls file from local directory
colnames(example21) = c("Observation","Shear_Strength_psi","Age_of_Propellant_weeks")
example21
## # A tibble: 20 × 3
##    Observation Shear_Strength_psi Age_of_Propellant_weeks
##          <dbl>              <dbl>                   <dbl>
##  1           1              2159.                   15.5 
##  2           2              1678.                   23.8 
##  3           3              2316                     8   
##  4           4              2061.                   17   
##  5           5              2208.                    5.5 
##  6           6              1708.                   19   
##  7           7              1785.                   24   
##  8           8              2575                     2.5 
##  9           9              2358.                    7.5 
## 10          10              2257.                   11   
## 11          11              2165.                   13   
## 12          12              2400.                    3.75
## 13          13              1780.                   25   
## 14          14              2337.                    9.75
## 15          15              1765.                   22   
## 16          16              2054.                   18   
## 17          17              2414.                    6   
## 18          18              2200.                   12.5 
## 19          19              2654.                    2   
## 20          20              1754.                   21.5
head(example21)
## # A tibble: 6 × 3
##   Observation Shear_Strength_psi Age_of_Propellant_weeks
##         <dbl>              <dbl>                   <dbl>
## 1           1              2159.                    15.5
## 2           2              1678.                    23.8
## 3           3              2316                      8  
## 4           4              2061.                    17  
## 5           5              2208.                     5.5
## 6           6              1708.                    19
attach(example21)
y = example21$Shear_Strength_psi
x = example21$Age_of_Propellant_weeks
n = length(x)
plot(Age_of_Propellant_weeks,Shear_Strength_psi,pch = 16)

  • want to study the linear relationship between the age of propellant and sheer strength

  • can draw a picture of it and assume linear model is a good model

Our model is \[ Y = \beta_0+\beta_1 X+\epsilon. \]

Here, \(Y\) is Shear_Strength_psi and \(X\) is Age_of_Propellant_weeks. We can compute what we want directly from definitaions \[ S_{xx} = \sum_{i=1}^n(x_i-\bar{x})^2\\ S_{xy} = \sum_{i = 1}^ny_i(x_i-\bar{x})=\sum_{i=1}^nx_iy_i-\bar{x}\sum_{i=1}^ny_i\\ \hat{\beta}_1 = \frac{S_{xy}}{S_{xx}}\\ \hat{\beta}_0 = \bar{y}-\hat{\beta}_1\bar{x}. \]

sxx = sum(x^2)-sum(x)^2/n;sxx
## [1] 1106.559
sxy = sum(x*y)-sum(x)*sum(y)/n;sxy
## [1] -41112.65
beta1 = sxy/sxx;beta1
## [1] -37.15359
beta0 = mean(y)-beta1*mean(x)
c(beta0,beta1)
## [1] 2627.82236  -37.15359
error = y-(beta0+beta1*x)
plot(error);hist(error)

  • can alwyas use this “dumb” way to calculate these values (especially if calculations are incredibly complex)

We can use R packages to do most of the work in the future. Most softwares have packages for linear model. In lm function, we have everything we want in the output summary.

# first state modle (y response ~ x, data)
# when there are multiple will see + sign with multiple responses
model1 = lm(Shear_Strength_psi ~ Age_of_Propellant_weeks,data = example21)
summary(model1)
## 
## Call:
## lm(formula = Shear_Strength_psi ~ Age_of_Propellant_weeks, data = example21)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -215.98  -50.68   28.74   66.61  106.76 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2627.822     44.184   59.48  < 2e-16 ***
## Age_of_Propellant_weeks  -37.154      2.889  -12.86 1.64e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.11 on 18 degrees of freedom
## Multiple R-squared:  0.9018, Adjusted R-squared:  0.8964 
## F-statistic: 165.4 on 1 and 18 DF,  p-value: 1.643e-10
  • will use lm() function a lot

  • what we care about in the summary (right now) are the Coefficients

  • for coefficients we have intercept (beta 0) and what we define as out slope (beta 1)

  • response is decreasing with respect to our ???

  • job of statistician is to prove that this slope and intercept are indeed true and not just two random numbers

  • F-test tells us if the whole model works good

For example, we know the variance of slope \(\beta_0\) is \[Var(\hat{\beta_1})=\frac{\sigma^2}{S_{xx}}\]. The test statistic of \(\beta_0\) is \[ t_0 = \frac{\hat{\beta_0}-\beta_{00}}{\sqrt{MS_{Res}(1/n+\bar{x}^2/S_{xx})}}. \] p-value is given by \[P(T>t_0).\] Let’s do it by hand.

sst = sum((y-mean(y))^2);sst
## [1] 1693738
ssres = sum(error^2);ssres
## [1] 166254.9
varBeta1 = (ssres/(n-2))/sxx;sqrt(varBeta1) # variance of \beta_1
## [1] 2.889107
sdOfBeta1 = sqrt(ssres/(n-2)*(1/sxx))
sdOfBeta0 = sqrt(ssres/(n-2)*(1/n+mean(x)^2/sxx))
t0 = (beta0-0)/sdOfBeta0;t0
## [1] 59.47464
pt(-t0,n-2)
## [1] 2.03178e-22
  • this computes the p value (last two columns by hand)

Similarly, lm returns everything else as we want. (We will meet Adjusted R-squared later).

Let \(\alpha = 0.05\), the \((1-\alpha)100\%=95\%\) confidence interval (C.I.) of \(\beta_1\) is \((-43.22 -31.08)\), given by

c(beta1+qt(0.025,n-2)*sdOfBeta1,beta1-qt(0.025,n-2)*sdOfBeta1)
## [1] -43.22338 -31.08380

Prediction

We predict the average response at point \(x_0\) with its corresponding confidence interval. For all \(x_i,i=1,...,n\), we can create a confidence band.

yhat = beta0+beta1*x
plot(x,y,pch=16)
lines(x,yhat)
x0 = 15;yhat0 = beta0+beta1*x0
ciPoint = c(yhat0-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(x0-mean(x))^2/sxx)),
            yhat0+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(x0-mean(x))^2/sxx)))
points(x0,yhat0,pch=2,cex=2);points(c(x0,x0),ciPoint,pch = 8)

yhatSort = beta0+beta1*sort(x)
ciBand = cbind(yhatSort-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1/n+(sort(x)-mean(x))^2/sxx)),
            yhatSort+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1/n+(sort(x)-mean(x))^2/sxx)))
ciPredBand = cbind(yhatSort-qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(sort(x)-mean(x))^2/sxx)),
            yhatSort+qt(0.025,n-2)*sqrt(ssres/(n-2)*(1+1/n+(sort(x)-mean(x))^2/sxx)))
plot(x,y,pch=16)
lines(x,yhat,lwd = 2)
lines(sort(x),ciBand[,1],type = 'l',lty = 'dashed',lwd = 2)
lines(sort(x),ciBand[,2],type = 'l',lty = 'dashed',lwd = 2)
lines(sort(x),ciPredBand[,1],type = 'l',lty = 'dotted',lwd = 2)
lines(sort(x),ciPredBand[,2],type = 'l',lty = 'dotted',lwd = 2)

Of course, we have simple way.

confint(model1)
##                              2.5 %    97.5 %
## (Intercept)             2534.99540 2720.6493
## Age_of_Propellant_weeks  -43.22338  -31.0838
newData = data.frame(Age_of_Propellant_weeks = sort(Age_of_Propellant_weeks))
bandOfConf = predict(model1, newdata = newData, interval = 'confidence')# "confidence"
bandOfPred = predict(model1, newdata = newData, interval = 'prediction')# "confidence"
xOrdered = newData$Age_of_Propellant_weeks
plot(x,y)
abline(model1, col = "red")
lines(xOrdered,bandOfConf[,2],type = 'l',lty = 'dashed',lwd = 2)
lines(xOrdered,bandOfConf[,3],type = 'l',lty = 'dashed',lwd = 2)
lines(xOrdered,bandOfPred[,2],type = 'l',lty = 'dotted',lwd = 2)
lines(xOrdered,bandOfPred[,3],type = 'l',lty = 'dotted',lwd = 2)

If we want to estimate the line through the origin, we can do the following