###################################################### #CHERRY TREE EXAMPLE######################################## ###################################################### library(lattice) library(MASS) trellis.device(color=T) cherry.frm <- read.table(file="e:/TempleInland05/cherrydata.dat",header=T, na.strings=c(".")) cherry.frm\$diam <- cherry.frm\$diam/12 # transform diameter to feet so all variables are measured # in the same units cherry.frm[1:3,] # look at the data #plot height versus potential explanatory variables plot(vol~ht,data=cherry.frm,main="Volume vs Height") plot(vol~diam,data=cherry.frm,main="Volume vs Diameter") cherry.lm1 <- lm(vol ~ ht+diam,data=cherry.frm) summary(cherry.lm1) #check residual plots as model diagnostics plot(cherry.lm1,which=1:2) plot(resid(cherry.lm1)~cherry.frm\$ht,main="Residuals vs height") abline(h=0) plot(resid(cherry.lm1)~cherry.frm\$diam,main="Residuals vs diameter") # this plot look 'U' shaped abline(h=0) #Might consider including quadratic term in diameter to improve fit cherry.lm2 <- lm(vol ~ ht+diam+I(diam^2),data=cherry.frm) summary(cherry.lm2) #residual standard error=2.625=sqrt(MSE), R^2=.9771 #check residual plots as model diagnostics plot(cherry.lm2,which=1:2) plot(resid(cherry.lm2)~cherry.frm\$ht,main="Residuals vs height") abline(h=0) plot(resid(cherry.lm2)~cherry.frm\$diam,main="Residuals vs diameter") # this plot looks much better abline(h=0) # testing whether quadratic term in diameter is necessary can # be done in a couple of different, but equivalent, (in the linear model) ways # First, can use anova() function to compare models with, without this term # via F test for nested models: anova(cherry.lm2,cherry.lm1) # Equivalently, t test of whether coefficient on diam^2 equals zero # is given by summary() and coef() functions # This t test statistics is just the parameter estimate divided by its se summary(cherry.lm2)\$coefficients # Note that the square of the t stat is the F stat above, and the # p-vals match. That is, these tests are equivalent. summary(cherry.lm2)\$coefficients[4,3]^2 # Alternatively, might consider regressing transformation of volume on height, # diameter. E.g., cube-root of volume on ht, diameter (since volume is in # cubic feet, others in feet. We don't pursue that idea here. # Another possibility: tree shaped like a cone. Volume of cone given # by V=(pi/12)*h*(d^2). Therefore, log(V)=log(pi/12)+log(h)+2*log(d) cherry.frm\$logV <- log(cherry.frm\$vol) cherry.frm\$logD <- log(cherry.frm\$diam) cherry.frm\$logH <- log(cherry.frm\$ht) cherry.lm3 <- lm(logV~logD+logH,data=cherry.frm) #model diagnostic plots (these all look good) plot(cherry.lm3,which=1:2) plot(resid(cherry.lm3)~cherry.frm\$ht,main="Residuals vs height") abline(h=0) plot(resid(cherry.lm3)~cherry.frm\$diam,main="Residuals vs diameter") # this plot looks good abline(h=0) # Instead of using raw residuals, we might want to use (externally) # studentized residuals, which are approximately standard normal # This can be useful for detecting outliers. The function studres() is # in the MASS package. plot(studres(cherry.lm3)~cherry.frm\$diam, main="Studentized residuals vs diameter") abline(h=0) # The identify() function allows you to interactively identify points on a # plot. identify(studres(cherry.lm3)~cherry.frm\$diam) # no obvious outliers here. summary(cherry.lm3) #R^2=.9777, model also fits very well, resid plots look good, more parsimonious #but difficult to directly compare non-nested models lm2 and lm3 #Here, as often, there is not one definitive model. >1 good model coef(cherry.lm3) #note that here, estimated coefficents on logD, logH are close to theoretical values # of 2, 1 from area of a cone. # Here are 95% confidence intervals on the regression coefficients # computed based on the t distribution (estimate plus/minus t_.975(df_E) * se's ) list(lowerLim95= summary(cherry.lm3)\$coefficients[,1]- qt(.975,summary(cherry.lm3)\$df[2])* summary(cherry.lm3)\$coefficients[,2],estimate=summary(cherry.lm3)\$coefficients[,1], upperLim95= summary(cherry.lm3)\$coefficients[,1]+ qt(.975,summary(cherry.lm3)\$df[2])* summary(cherry.lm3)\$coefficients[,2]) # This is much easier with the function conf.intervals() in the # package alr3 library(alr3) conf.intervals(cherry.lm3,level=.95) # We can see that the theoretical values of 2 and 1 for logD and logH, # respectively, lie inside these intervals.