# factors.R
# An R script illustrating basic ideas of the factor
# class in R, and appropriate uses of factors in
# models fitted in R.
# This script is a companion to the talk,
# Working with Factors in R, given by Dan Hall,
# Director of the SCC as part of the SCC Seminar
# Series on Data Analysis in April, 2021.
# Set the old behavior of stringsAsFactors:
# NOTE(review): this global option is deprecated in recent R versions
# (R >= 4.1 warns when it is consulted) -- confirm behavior on the R
# version used to run this script.
options(stringsAsFactors = TRUE) # Had been the default until R 4.0.0
# Load some packages that we'll need:
library(emmeans) # estimation & inference on marginal means
library(car) # Anova(), lht() and other functions
library(codingMatrices) # extends R's built-in tools for contrast matrices
library(forcats) # functions for working with factors
library(ggplot2) # graphics package
library(aods3) # For gof() function used in the logistic reg example
# Examples involving factors for
# SCC Seminar on Factors
# Set RNG seed to make the code here that uses
# random numbers reproducible
set.seed(323329)
sex <- rep(c("M","F"), each=5) # 5 Male, 5 Female
age <- sample(21:80,size=10,replace=TRUE) # random ages between 21 and 80
# bin age into integer age categories: 1 = (20,30], 2 = (30,40], ..., 6 = (70,80]
agecat <- .bincode(age,breaks=seq(from=20,to=80,by=10),right=TRUE)
sex;age;agecat # look at what we've created
# Look at their modes (the basic storage type of each object):
mode(sex); mode(age); mode(agecat)
# Now create factors from sex and agecat and look at them:
sexFac <- factor(sex); agecatFac <- factor(agecat)
sexFac; agecatFac
# They are unordered factors, as seen below.
class(sexFac); class(agecatFac)
is.ordered(sexFac); is.ordered(agecatFac)
# But what is the mode of these factors?
mode(agecatFac)
is.numeric(agecatFac) # FALSE, even though the mode is numeric -- see notes below
# Huh??
# How about sexFac, whose levels were "M" and "F"?
mode(sexFac) # numeric? Really?
is.numeric(sexFac)
# Put these variables in a data frame. With stringsAsFactors = TRUE (set
# at the top of this script), the character vector sex will be converted:
myData <- data.frame(sex, sexFac,age,agecat,agecatFac)
# check the structure of the data frame
str(myData)
# Wait... wasn't sex a character vector, not a factor? What happened?
# 1. factors are numeric vectors, but the is.numeric() function is programmed
# to return FALSE for them because we usually wish to distinguish an
# ordinary numeric vector from a factor and treat them differently
# 2. All factors are really of mode "numeric", even those whose levels are
# character strings (like sexFac), logicals, etc.
# 3. The elements of factors are their numeric levels (1 for the first level,
# 2 for the second level, etc.). The levels themselves are stored as an
# attribute.
# 4. Up until recently, when a data frame was constructed that includes
# character vectors, the default behavior of data.frame(),
# as.data.frame(), read.table(), and several other functions was to
# coerce character vectors into factors.
# This can be changed with the stringsAsFactors= argument (set to FALSE)
# or, when using read.table(), the as.is= argument.
# 5. Note that stringsAsFactors can be set as a global session option. Use
# the code: options(stringsAsFactors = FALSE). The setting
# stringsAsFactors = FALSE is now the default in a factory fresh
# installation of R, but stringsAsFactors = TRUE had been the default
# until recently.
# Now try again to put the variables in a data frame:
# (the stringsAsFactors=FALSE argument overrides the global option,
# so this time sex is kept as a character vector)
myData <- data.frame(sex, sexFac,age,agecat,agecatFac,
stringsAsFactors=FALSE)
# check the structure:
str(myData)
# Recommendation: In data frames with factors, keep two versions of the variable
# one that is a factor, and one that is not.
# Q: How do I coerce a factor back to a character or numeric vector?
# A: Use as.character() to coerce to character, as.numeric(as.character()) to
# coerce numeric levels to a numeric vector
sexFac
(sexCh <- as.character(sexFac))
agecatFac
(agecatNum <- as.numeric(as.character(agecatFac)))
# Can also use the more confusing but (slightly) more efficient method:
# as.numeric(levels(agecatFac))[agecatFac]
# Note: don't simply use as.numeric(). See?:
(agecatNumWrong <- as.numeric(agecatFac)) # gives level numbers, not the levels
# The factor() function will use the alphabetically ordered unique values
# of its argument to create levels, but levels can be set and can be put
# in non-alphabetic order. The order will matter in many uses of the factor.
sexFac
(sexFac.MF <- factor(sex,levels=c("M","F")))
table(sexFac,agecat)
table(sexFac.MF,agecat) # note the different order of rows
# Can also include levels not observed in the data, and can include
# labels. Labels are like formats in SAS. They are printed in place of the levels
# for more informative and aesthetically pleasing output.
agecatFac <- factor(agecat,
levels=1:6,
labels=c("21-30","31-40","41-50","51-60","61-70","71-80"))
agecatFac
sexFac <- factor(sex,levels=c("F","M"),labels=c("Female","Male"))
table(sexFac,agecatFac)
# Recommendation: Get in the habit of using nice labels!
# How do you re-code a factor?
# Lots of useful tools for this in the forcats package.
# Collapsing levels: use fct_collapse():
data.frame(agecatFac,newagecatFac=
fct_collapse(agecatFac,
youngAdult="21-30",
middleAge=c("31-40","41-50","51-60"),
senior=c("61-70","71-80"))
)
# Expanding (adding) levels: use fct_expand()
# Dropping levels: use fct_drop()
# Re-order levels:
# use fct_inorder(), fct_infreq(), fct_inseq(), fct_relevel(),
# fct_reorder(), fct_rev(), or fct_shift().
# Combine levels based on frequency of occurrence:
# use fct_lump_min(), fct_lump_prop(), fct_lump_n(), or
# fct_lump_lowfreq()
# Recode levels: use fct_recode() (can be used to collapse levels)
# Create a factor from combination of levels of two factors:
# use fct_cross()
# Example of fct_recode():
# Again, consider agecatFac:
agecatFac
# fct_recode() similar to fct_collapse() but each existing
# level is re-coded, allowing levels to be collapsed,
# renamed, or dropped. Assigning NULL to a level drops it
# (the corresponding observations become NA):
newagecatFac <-
fct_recode(agecatFac,
twenties="21-30",
NULL="31-40",
NULL="41-50",
fifties="51-60",
senior="61-70",
senior="71-80")
levels(newagecatFac)
data.frame(agecatFac,newagecatFac)
# Example of fct_cross():
df <- data.frame(sexFac,
agecatFac,
sexageFac=fct_cross(sexFac,agecatFac))
levels(df$sexageFac)
# By default, empty levels are dropped, but that behavior can be modified:
levels(fct_cross(sexFac,agecatFac,keep_empty=TRUE))
# Note that setting the levels of a factor to be in a particular order
# can be very useful EVEN IF THE FACTOR IS UNORDERED.
# But for ordered factors, the ordering is of course crucial.
# Note that agecatFac is currently an unordered factor, but it may be useful
# for some purposes to which this factor might be put to make it an
# ordered factor (note the ordered=TRUE argument below):
agecatOrdFac <- factor(agecat,ordered=TRUE,
levels=1:6,
labels=c("21-30","31-40","41-50","51-60","61-70","71-80"))
age
agecatFac; agecatOrdFac
# What is the difference between an ordered and an unordered factor?
# An ordered factor stores not only its levels, but their ordering.
# Comparison operators and functions such as min(), max(), range() can be
# used for ordered factors, but not for unordered ones.
age; agecat
agecatOrdFac[1]>agecatOrdFac[2]
agecatFac[1]>agecatFac[2] # unordered factor: comparison gives a warning and NA
max(agecatOrdFac)
# max() on an unordered factor is an ERROR. Wrap it in try() so the error
# message is still shown, but does not abort the script when it is
# run non-interactively via source():
try(max(agecatFac))
# More importantly, it is often appropriate and useful to parameterize the
# effects of ordered factors differently than the effects of unordered
# factors.
# Suppose we only had agecat, and not the exact ages of subjects and we
# wished to fit a model to compare the mean response at each age category.
# And also suppose we had a bigger study, with subjects in every one of the
# 6 age categories.
# Let's keep it simple and suppose we have a continuous response and wish to
# fit a one-way anova model (although the parameterization issues here apply
# to other models with linear predictors such as logistic regression models and
# other GLMs). The model would be
# y_ij=mu + alpha_i + e_ij (M1-e, effects version of the model)
# or, equivalently,
# y_ij=mu_i + e_ij (M1-m, cell means version of the model)
# where i=1,...,6 indexes the age categories, and j=1,...,n_i indexes
# observations (replicates in a designed expt) within each age category.
# Note that these models are equivalent, but model M1-e is "overparameterized"
# Each model fits 6 means, but model M1-e uses 7 parameters to do it
# (one too many). This overparameterization makes the model not
# identifiable and the 7 parameters not uniquely estimable.
# It turns out that the model can be fitted without resolving (removing) the
# overparameterization (and SAS does this in procs such as PROC GLM),
# but R never does so. In R, the overparameterization is removed by adding
# constraints to the effects for the factor effects (the alpha_i's)
# before fitting the model. This is done through the use of "contrasts"
# applied to the age category factor (agecatFac).
# Such "identifying constraints" are not unique; there are many different
# ones that could be applied to remove the non-identifiability, and each one
# yields an equivalent model, parameterized in a particular way.
# Sum-to-zero constraints: alpha_1+alpha_2+...+alpha_6=0
# Set-one-level-to-zero constraints: alpha_i=0 for one specific choice of i
# e.g., alpha_1=0 (first level baseline)
# e.g., alpha_6=0 (last level baseline)
# These constraints can be imposed on the model via what R calls
# "contrasts", or "contrast matrices". In fact, the name is a bit
# misleading and a better name would be "coding matrices" (as suggested
# by Venables, 2018), but we will use the traditional R terminology.
# For a factor with p levels, a contrast matrix B is a p x (p-1) matrix
# such that [1,B] is non-singular (invertible). It can be used to
# reparameterize the one-way cell means model involving the factor
# to an equivalent model with an intercept. In some cases, the
# reparametrized model corresponds to the effects model with side
# conditions, but contrast matrices are more general tools for model
# reparameterization and not all models that have been reparameterized
# with contrast matrices have an obvious correspondence to a set of
# side conditions imposed on the effects model.
# If the cell means model has mean vector X*mu, where mu is a vector
# mu=(mu_1,...,mu_p)', then a contrast matrix allows a
# reparameterization from mu to beta where mu=[1,B]*beta and our
# model mean becomes X*[1,B]*beta or Xtilde*beta where Xtilde=X*[1,B]
# It is not hard to see that Xtilde*beta has the form
# 1*beta_1 + X*B*(beta_2,...,beta_p)' so that the reparametrized model
# has an intercept and the null hypothesis of equal means:
# H_0: mu_1=...=mu_p holds iff beta_2=...=beta_p=0
# If we are just interested in this null hypothesis, then the choice
# of contrast matrix does not matter: all give the same test. But
# each contrast matrix gives a different parameterization of the model
# under which the elements of beta have different interpretations.
# So for some purposes, one choice of contrast matrix may be more
# convenient than another. Moreover, in models involving interactions
# between two factors and main effects of each factor, the tests
# (in particular the so-called "Type III" tests) can depend on the
# contrast matrices chosen for each factor. So the choice of
# contrasts for factors in R is important and is something to be
# understood by the user of the model.
# Contrast functions in R:
# Contrasts matrices are applied to factors in R whenever the factor
# is used in the model. There is a system option that specifies the
# contrasts to be used by default for both unordered and ordered
# factors.
# Here, we query that option to see that the default contrast type
# for unordered factors is contr.treatment, and the default for
# ordered factors is contr.poly.
options("contrasts")
# We can use these defaults, or we can change the defaults in our
# current R session by modifying the "contrasts" session option via
# a command like this (first element = unordered factors, second = ordered):
op <- options(contrasts=c("contr.SAS","contr.helmert"))
# the previous statement resets the defaults for the remainder of the
# session and saves the previous value of the "contrasts" option in an
# object called op (any name could be used). Below we check that the
# change we made took effect, and then reset back to the original
# setting by using our op object.
options("contrasts")
options(op)
options("contrasts")
# Alternatively, a specific choice of contrasts can be assigned to
# an individual factor with the contrasts() function (the assigned
# matrix is stored as an attribute of the factor itself).
# Here we query the contrasts for our sexFac, agecatFac, and
# agecatOrdFac contrasts and see that they match those in the
# session "contrasts" option, but then assign sum-to-zero contrasts
# for sexFac.
contrasts(sexFac) #contr.treatment contrast matrix for a 2-level factor
contrasts(agecatFac) #contr.treatment contrast matrix for a 6-level factor
contrasts(agecatOrdFac) #contr.poly contrast matrix for a 6-level ordered factor
contrasts(sexFac) <- contr.sum(2) # contrast matrix for sum to zero contrasts
contrasts(sexFac)
# Contrast functions in R:
# These functions correspond to traditional "side conditions" in anova
# models with effects parameterizations:
# contr.treatment(p) - Sets first level effect to zero.
# Parameters are differences b/w each level and
# the mean at the first of p levels.
# contr.treatment(p,base=b) - Sets bth level effect to zero.
# Parameters are differences b/w each level and
# the mean at the bth of p levels.
# contr.SAS(p) - Same as contr.treatment(p,base=p).
# Sets last level effect to zero.
# Parameters are differences b/w each level and
# the mean at the last of p levels.
# contr.sum(p) - Imposes sum-to-zero constraints on effects.
# Parameters are differences b/w each level and
# the (equally weighted) average of the means at
# each of the p levels.
# Note that with contr.sum(), the intercept in the model is the average
# of the means at each level of the factor, not the grand average of
# all the data (these will differ when the design is unbalanced). This
# is demonstrated below.
junkFac <- factor(c(rep("M",4),rep("F",12))) # an unbalanced factor
y <- rnorm(16) # some fake response data to work with
contrasts(junkFac) <- contr.sum(2) # set sum-to-zero contrasts
m1 <- lm(y~junkFac) # fit the model
coef(m1) # examine coefficients
grp.means <- tapply(y,junkFac,mean) # compute means at each factor level
mean(grp.means) # mean of the means
grp.means-mean(grp.means) # coefs are deviations from mean of means
mean(y) # grand average of data is different
# There are several contrast functions that parameterize the model
# in terms of contrasts across the levels of a factor that are
# traditionally used for the purpose of "mean separation"
# (aka post hoc testing) after
# deciding that the means across the levels are not all equal
# (that is, after rejecting the hypothesis of equal means in a one-way
# anova).
# These include:
# contr.treatment(p,base=b) - Discussed above.
# p=number of levels, b=baseline level
# contr.helmert(p) - Constructs orthogonal contrasts between
# level 1 vs level 2
# levels 1 & 2 vs level 3
# levels 1,2 & 3 vs level 4, etc.
# contr.poly(p) - Constructs orthogonal polynomial contrasts among the
# p levels of the factor. By default, levels are assumed
# equally spaced. But if not, the scores argument can
# be used for non-equally spaced levels. This function
# only makes sense for ordered factors with levels that
# can be assigned quantitative scores.
# E.g., in a one-way anova model y_ij=mu_i+e_ij we conduct the
# main effect F test to test H_0:mu_1=...=mu_p. If we reject H_0,
# then we wish to know which means differ from which others.
# Sometimes this is done by making all pairwise comparisons, but that
# is a shotgun blast approach. Often there are fewer comparisons that
# are of interest corresponding to certain a priori (or planned)
# hypotheses of interest.
# E.g., if treatment 1 is a control treatment, we may be content to look
# only at pairwise comparisons with the control:
# mu_1 - mu_2, mu_1 - mu_3, ..., mu_1 - mu_p.
# This can be done by fitting the model in whatever parameterization
# we like and then testing these pairwise contrasts based on the fitted
# model.
############################
# Walking Infants Example #
############################
# An example. An experiment where infants were randomized to four
# treatments (control, no exercise, passive exercise, active exercise)
# to stimulate and exercise a (pseudo) walking response in newborns.
# Response is the age (in months) when child first began to walk.
# Here, group is the treatment factor.
options(stringsAsFactors=FALSE) # Reset to current default as of R 4.0.0
# read the data in, setting group variable to be a factor:
# NOTE(review): this URL may no longer be reachable -- confirm the
# current location of the walking.dat file before running.
walkdata <- read.table(file="https://faculty.franklin.uga.edu/dhall/sites/faculty.franklin.uga.edu.dhall/files/walking.dat",
header=TRUE,colClasses=c("numeric","factor"))
head(walkdata)
is.factor(walkdata$group)
levels(walkdata$group) # notice the levels are put in alphabetical order
# Fit a one-way anova model in the default parameterization:
walk.m1 <- aov(age~group,data=walkdata)
class(walk.m1) # notice this object has two classes: "aov" and "lm"
summary(walk.m1) # the default summary for an aov object
summary.lm(walk.m1) # can also get the summary when treating as an
# lm object, which includes coefficients
# In the parameterization of walk.m1, the intercept is the
# estimated mean for the reference category (the active treatment,
# which is first in alphabetical order)
# and other parameters are differences between the means in
# other treatments and the active treatment.
# If we wish we can fit the model in another parameterization.
# That does not prevent us from doing any inference on the treatment
# means that is desired, but it does change the coefficients of the
# model and their interpretation.
contrasts(walkdata$group) <- contr.sum(4)
walk.m1a <- aov(age~group,data=walkdata)
summary(walk.m1a) # same ANOVA table as for walk.m1
summary.lm(walk.m1a)$coefficients # different parameters than walk.m1
# In the parameterization of walk.m1a, the intercept is the
# grand mean of all treatment means
# and other parameters are differences between the means in
# the first 3 treatments and the grand mean.
# Fitting the model in a different parameterization does not prevent
# us from doing inference on any (estimable) quantity of interest
# from the model. Here we estimate the treatment group means from
# two different parameterizations of the same model and, of course, the
# results agree:
(walk.m1.emm <- emmeans(walk.m1,specs= ~ group))
(walk.m1a.emm <- emmeans(walk.m1a,specs= ~ group))
# The emmeans() function (from the emmeans package) used above yields
# estimates of the treatment means. We can feed those into the
# contrast() function (also from the emmeans package and different
# from the contrasts() function that specifies contrast matrices for
# factors) to do inference on contrasts among the means. Here is
# such an example (ref=2 picks level 2, "control", as the reference):
contrast(walk.m1.emm,"trt.vs.ctrl",ref=2) # Dunnett's comparisons with ctrl
# Alternatively, we can parameterize the model in terms of the contrasts
# in group means that we are interested in. Then the model parameters
# themselves estimate the contrasts of interest and no extra step to
# estimate those contrasts is necessary.
# E.g., we can parameterize the model in terms of an intercept and
# pairwise deviations from the control treatment which, for the
# group factor in walkdata, is level 2:
contrasts(walkdata$group) <- contr.treatment(levels(walkdata$group),
base=2)
walk.m1b <- aov(age~group,data=walkdata)
summary(walk.m1b) # same ANOVA table as for walk.m1
summary.lm(walk.m1b) # different parameters than walk.m1
# which now are the estimates of the pairwise
# contrasts with the control mean
# As a second example, we can parameterize the model in terms
# of an intercept and pairwise differences in adjacent category
# means of an ordered version of group:
# Make the ordered version of group (here the consecutive levels of
# the group factor represent increasing intervention to stimulate the
# walking movements in the infant)
walkdata$groupOrd <- factor(walkdata$group, ordered=TRUE,
levels=c("control","noex","passive","active"))
# The code_diff() function is from codingMatrices package:
contrasts(walkdata$groupOrd) <- code_diff(4)
# Fit the model with the new parameterization, which does not change
# the ANOVA table or test of main effect:
walk.m1c <- aov(age~groupOrd,data=walkdata)
summary(walk.m1c) # same ANOVA table as for walk.m1
# But now the coefficients (other than the intercept) are estimates
# of mu2-mu1, mu3-mu2, mu4-mu3:
summary.lm(walk.m1c)$coefficients
# The same contrasts in treatment means can be obtained by getting
# the 4 estimated marginal group means and then taking pairwise
# differences between consecutive means via the contrast() function
# of the emmeans package using contrast type "consec":
(walk.m1c.emm <- emmeans(walk.m1c,specs= ~ groupOrd))
contrast(walk.m1c.emm,"consec")
# As a third example, we can parameterize the model in terms
# of an intercept, linear, quadratic and cubic effects of the
# intervention. Generally speaking, this makes sense for quantitative
# factors (e.g., dose level: 0, 50mg, 100mg, 150mg), but I don't want
# to introduce a new example of that kind here so we illustrate
# with the walking experiment. Here, fitting linear, quadratic and cubic
# effects assumes the ordered levels control, noex, passive and active
# are equally spaced with respect to some underlying quantitative
# "motor stimulation" variable. Another way to think about these
# effects is as linear, quadratic, cubic effects in a "scored" version
# of groupOrd where we attached scores 1,2,3,4 (or, equivalently, any
# equally spaced numbers) to the levels control, noex, passive, active.
# Such a parameterization corresponds to a set of orthogonal
# polynomial contrasts among the means and is implemented in
# contr.poly() and code_poly().
contrasts(walkdata$groupOrd) <- contr.poly(4)
# Fit the model with the new parameterization, which does not change
# the ANOVA table or test of main effect:
walk.m1d <- aov(age~groupOrd,data=walkdata)
summary(walk.m1d) # same ANOVA table as for walk.m1
# But now the coefficients (other than the intercept) are estimates
# of linear, quadratic and cubic effects of motor stimulation:
summary.lm(walk.m1d)$coefficients # linear effect signif, others not
# The same contrasts in treatment means can be tested by getting
# the 4 estimated marginal group means and then estimating orthog
# polynomial contrasts in them using the emmeans package:
contrast(walk.m1c.emm,"poly")
# Note that point estimates of the polynomial contrasts differ
# because they are scaled differently in the two implementations, but
# that does not affect the tests, which agree, and which are what we
# usually care about in this context.
# contr.poly() treats the factor levels as evenly spaced unless told
# otherwise. For a quantitative factor whose levels are NOT evenly
# spaced (e.g., dose = 0mg, 100mg, 250mg, 400mg), supply the actual
# level values via the scores= argument. Here we build the orthogonal
# polynomial contrast matrix for that dose example:
print(contr.poly(4, scores = c(0, 100, 250, 400))) # linear coefs no longer evenly spaced
# Before we leave the walking example, note that we have considered
# several parameterizations of the one-way anova model for these data,
# all of which have an intercept (or constant term) and all of which
# yield an ANOVA F test of equal means from the ANOVA table given by
# the model summary.
summary(walk.m1) # default 1st level set to zero param
summary(walk.m1a)# effects model w/ sum-to-zero constraints
summary(walk.m1b)# 2nd level effect set to zero (contrasts w/ control)
summary(walk.m1c)# pairwise contrasts in consecutive levels
summary(walk.m1d)# param in terms of orthogonal polynomials
# But we have not used the cell means version of the model. Here it is
# (the 0 in the formula suppresses the intercept):
walk.m1e <- aov(age~0+group,data=walkdata)
# Note formula can also be as shown here:
# walk.m1e <- aov(age~group-1,data=walkdata)
summary.lm(walk.m1e) # summary showing coefficients
# Notice that the coefficients are now the estimated group means,
# same as we computed from, for example, model walk.m1:
(walk.m1.emm <- emmeans(walk.m1,specs= ~ group))
# But notice that most of the rest of the summary for model walk.m1e
# disagrees with results from other models.
# 1. walk.m1e: R-squared=.9855, Adjusted R-squared=0.9825
# walk.m1 : R-squared=.2538, Adjusted R-squared=0.1348
# Default method of computing R-square stats in linear models
# assumes there is an intercept and is incorrect otherwise.
# 2. walk.m1e: ANOVA F-statistic=323.6 on 4,19 df
# walk.m1 : ANOVA F-statistic=2.142 on 3,19 df
# This difference can also be seen in the default summaries of the
# two models, which shows the ANOVA table:
summary(walk.m1e)
summary(walk.m1)
# What is going on here?
# For walk.m1 (model with intercept), SS_group quantifies
# variability around the grand mean. F test tests hypothesis
# that that variability is zero <--> all means are equal.
# For walk.m1e (model without intercept), SS_group quantifies
# variability around zero. F test tests hypothesis
# that that variability is zero <--> all means are equal to 0!
# The difference in the hypotheses being tested should be
# apparent from the df:
# 3 df to test mu1=mu2=mu3=mu4.
# 4 df to test mu1=mu2=mu3=mu4=0.
# The latter hypothesis is almost never of interest, so "overall
# F test" from cell means model should typically be ignored.
# 3. The parameter estimates, SEs, and t-test differ.
# By now, it should be no surprise that they differ. All of these
# different model parameterizations have parameters with different
# interpretations and therefore different estimates (etc.). This
# should make it clear that the parameter estimates and/or
# associated inferences reported for an anova model are not
# necessarily quantities of interest. In the case of the cell means
# model, walk.m1e, the parameter estimates are estimated treatment
# means. The t test associated with any one of those estimates tests
# the hypothesis that the mean walking age in that treatment is 0
# months---NOT an interesting hypothesis to test! Similarly, the
# parameter estimates and inferences in any other parameterization
# are not necessarily of interest. That is why, for ANOVA models,
# the default summary output (true of the summary() function in R
# and in other software like PROC ANOVA or PROC GLM in SAS) for
# ANOVA models does not show parameter estimates, SEs and t tests
# as it would for a regression model.
# If the parameter estimates from an ANOVA model are not necessarily
# of interest, what is?
# Typically, the starting place in a one-way ANOVA model is the F
# test of equal means across the grouping (e.g., treatment) factor.
# Then the most useful and appropriate inferences depend on the
# questions of interest and the goals of the analysis.
# Often, we will want to estimate treatment means and put confidence
# intervals around them.
# Sometimes, we will wish to make pairwise comparisons between
# all treatment means, or between a control or standard treatment mean
# and each other treatment mean. These comparisons could involve
# tests of significance and/or point estimates and confidence intervals
# for pairwise differences in means.
# Sometimes there will be a set of specific contrasts among the means
# to be estimated or tested (which may not take the form of pairwise
# differences).
# In most of these scenarios, there is a set of several follow-up
# inferences---an example of a "family" of inferences---after the
# overall main effect F test. Typically, it is
# viewed as appropriate to control a combined Type I error rate for
# this family if it is a set of tests, or to control the combined
# coverage probability if it is a family of confidence intervals.
# How do we do this?
# First, after having selected a model and verified that it is
# appropriate, our inferences should be based on our model. So
# our inferences on the means, for example, should be based on
# our model-based estimates of the means. In particular, it does not
# make sense, and in some settings can be quite a bad idea, to go
# back to square one and just estimate treatment means with sample
# means of the data in each treatment, ignoring our model. Such means
# are often called raw means. In simple
# contexts the model-based estimates of treatment means are the raw
# treatment means, but that is not always true(!)
# and, when it is not true, using the raw means can be a very
# poor basis of inference.
# Model-based estimates and inferences on means can be done with the
# emmeans package. This package has extensive capabilities and can be
# complex and confusing to use. But it is also well documented.
# (The emmeans package used to be called lsmeans. The term LSMEANS is
# used in SAS to refer to model-based estimates of means. In classical
# linear models that are fitted using least squares, then these would
# be least-squares estimates of the means (hence the term, lsmeans).
# But SAS uses the term LSMEANS even in PROCs that fit models with
# maximum likelihood and other methods and a more general term is
# sensible. Hence, the R package lsmeans, originally designed to give
# results like those available via the LSMEANS statement in SAS,
# was renamed emmeans a few years ago. The term emmeans is short for
# estimated marginal means.)
# We used the emmeans() function several times above to get estimates
# of group means in a one-way ANOVA model and the contrast()
# function to get inferences on a family of contrasts among those means.
# E.g., again, here are the estimated means for each treatment:
(walk.m1.emm <- emmeans(walk.m1,specs= ~ group))
# which are the same point estimates as the parameters in the cell
# means model:
coef(walk.m1e)
# And here are Bonferroni simultaneous confidence intervals for the
# four treatment means:
confint(walk.m1.emm,adjust="bonferroni")
# And unadjusted confidence intervals:
confint(walk.m1.emm,adjust="none")
# Or, if we are interested in pairwise differences among all treatment
# means, we might do Tukey's HSD tests:
contrast(walk.m1.emm,method="pairwise",
adjust="tukey") # Tukey HSD tests for all pairs of means
contrast(walk.m1.emm,
method="pairwise") # Tukey adjustment is default for "pairwise" method
# The summary function applied to an emmGrid object (e.g., the class
# of object returned by emmeans() that contains a set of estimated
# marginal means for each level of a factor or contrast() that
# returns a set of contrasts among means) can be used to output
# tests and/or confidence intervals with simultaneous inference
# adjustments. The infer= argument controls which types of inferences
# you are requesting (tests, intervals, both or neither).
# Bonferroni intervals for each mean but not tests of each mean=0
# because those are not interesting here:
summary(walk.m1.emm,infer=c(TRUE,FALSE),adjust="bonferroni")
# Tukey HSD intervals and tests for each pairwise diff among means :
summary(contrast(walk.m1.emm,
method="pairwise"),
infer=c(TRUE,TRUE),adjust="tukey")
# The inferences above are for means at each level of the single
# group (or treatment) factor involved in the one-way ANOVA model we
# fitted to the data from the walking experiment.
# The concept of marginal means arises when fitting multi-factor
# models.
############################
# Soybean Weeds Example #
############################
# Next we consider data from a randomized complete block
# design with a single blocking factor (location) and two
# crossed treatment factors: soybean variety (16 of them) and
# method of herbicide use (none, applied after 2 weeks, applied
# after 4 weeks). The response was weed biomass.
# In each location, the 16*3=48 treatments were applied to
# one plot each.
# Read the data and, inside within(), build factor versions of the
# three classification variables with informative labels.
# NOTE(review): this URL may no longer be reachable -- confirm the
# current location of the weeds.dat file before running.
weedDat <- within(read.table(file="https://faculty.franklin.uga.edu/dhall/sites/faculty.franklin.uga.edu.dhall/files/inline-files/weeds.dat",
header=TRUE,stringsAsFactors=FALSE,
colClasses=c("character","numeric","numeric",
"character")),
{
varFac <- factor(variety)
herbFac <- factor(herb,levels=as.character(c(0,2,4)),
labels=c("none",
"2 weeks",
"4 weeks"))
locFac <- factor(loc,levels=c("R","S"),
labels=c("Rosemount","St.Paul"))
})
head(weedDat,3)
# A suitable model is one with an effect for the blocking
# factor, and then main effects and an interaction for the
# two treatment factors.
weeds.m1 <- aov(weeds~locFac+varFac*herbFac,data=weedDat)
summary(weeds.m1)
# We are potentially interested in several types of means here:
# Joint means are means at combinations of experimental factors.
# Here we might be interested in the mean response in each of
# the 48 treatments. These would be joint means at the combos
# of variety and herbicide.
# Marginal means. These are means at the level of one factor,
# averaged over the levels of another. Or, in the case of
# 3 factors, we might define marginal means at each level of
# factor A averaged over the levels of factors B and C, or
# we might be interested in the marginal means at combinations
# of A and B, averaged over the levels of factor C.
# All of these are marginal means.
# In our example, there is extremely little evidence of
# interaction and strong evidence of an effect of herbicide.
# So we might be interested in estimating the marginal mean
# for herbicide, averaged over variety. Note that we cannot
# assess interactions between the blocking factor and the treatment
# factors here, but under the assumption that such interactions
# are absent, it is natural to average over location too when
# estimating a marginal mean for herbicide.
# Here are the joint means for each treatment, the marginal means
# for variety and the marginal means for herbicide. Note that
# all of these means are marginal with respect to location too---that
# is, they are averaged over the two locations.
# Passing a named list to specs= returns a named list of emmGrid
# objects (jointMeans, varietyMargMeans, herbMargMeans), used below.
(weeds.m1.emm <- emmeans(weeds.m1,
specs= list(jointMeans= ~ varFac:herbFac,
varietyMargMeans= ~ varFac,
herbMargMeans= ~ herbFac)))
# We can examine the nature of the non-significant two-way interaction
# here with an interaction plot (aka a profile plot) of the joint
# means.
emmip(weeds.m1,herbFac~varFac) +
  theme(legend.position="top",
        axis.text.x = element_text(hjust=0,angle = -45))
# Given that the interaction is not (nearly) significant, it is sensible
# to assess the main effects and do inferences on the marginal means
# for each factor. Since the variety main effect is not close to
# significant (p=.173), and the herbicide effect is (p<.0001), it is
# appropriate to do inference on the marginal herbicide means.
# Unadjusted CIs for each herbicide marginal mean:
plot(weeds.m1.emm$herbMargMeans,horizontal=FALSE) +
  ggtitle("Estimated marginal means for herbicide\nwith 95% CIs")
# Bonferroni-adjusted CIs for each herbicide marginal mean:
plot(emmeans(weeds.m1,specs= list(herbMargMeans= ~ herbFac),
             adjust="bonferroni"),horizontal=FALSE) +
  ggtitle("Estimated marginal means for herbicide\nwith Bonferroni simultaneous 95% CIs")
# Tukey HSD adjusted confidence intervals for each pairwise difference
# in marginal herbicide means.
# (The list element is named herbMargMeans; the original abbreviated it
# to $herbMargMean, which worked only through R's partial matching of
# list names with $ --- spelled out in full here.)
plot(contrast(weeds.m1.emm$herbMargMeans,method="pairwise"),
     horizontal=FALSE) + geom_vline(xintercept=0,color="red",linetype=2)
# Must use geom_vline() (instead of geom_hline()) above because
# plot is built horizontally and then rotated because of the
# horizontal=FALSE option.
# "simple effects" vs "main effects"
# In a model with multiple treatment factors and interactions, we usually
# refer to the test of significance of a factor A that is involved in an
# interaction A*B as the main effect of A, or the test of main effects of
# A. In addition, you may also have heard of the simple effect of A.
# A simple effect is an effect of a treatment factor at a given level
# of the other factor (or factors). E.g., let's make our weed example
# a bit simpler and pretend there were only two herbicide treatments:
# no herbicide, and herbicide applied at 2 weeks.
# Let's refit our same model with block effects, effects of each
# treatment factor (herbFac and varFac) and the interaction b/w
# herbFac & varFac:
# (The subset= argument drops the "4 weeks" plots before fitting.)
weeds.m2 <- aov(weeds~locFac+varFac*herbFac,data=weedDat,
subset= as.character(herbFac) != "4 weeks")
summary(weeds.m2)
# The interaction is still not nearly significant. Nevertheless, here is
# the interaction plot:
emmip(weeds.m2,herbFac~varFac) +
theme(legend.position="top",axis.text.x = element_text(hjust=0,angle = -45))
# The simple effects of herbicide are the effects of "none"
# and "2 weeks" within each variety.
# By "effects", I mean differences from the grand mean.
# The main effects of herbicide are the averages of the simple
# effects over the 16 varieties.
# Let's see the distinction graphically.
# Compute the overall, marginal, and joint means:
weeds.m2.emm <- emmeans(weeds.m2,
specs= list(grandMean= ~1,
jointMeans= ~ varFac:herbFac,
varietyMargMeans= ~ varFac,
herbMargMeans= ~ herbFac))
# Now re-plot the interaction plot and show the simple effects of
# herbicide for one of the varieties:
# df holds the joint (variety x herbicide) means; df1 holds the
# endpoints of the two arrows drawn at x=2 (the x-axis position of
# variety Lambert in the plot).
df <- as.data.frame(weeds.m2.emm$jointMeans)
# NOTE(review): emmean[1] and emmean[2] below are taken to be the
# "none" and "2 weeks" joint means for Lambert, i.e., the two Lambert
# rows in grid order --- confirm against the printed grid if factor
# levels ever change.
df1 <- data.frame(x1=2,
y1=df[as.character(df$varFac)=="Lambert",]$emmean[1],
x2=2,
y2=df[as.character(df$varFac)=="Lambert",]$emmean[2],
x3=2,
y3=as.data.frame(weeds.m2.emm$varietyMargMeans)$emmean[2])
# df2: the marginal-mean row for Lambert (assumed to be row 2 of the
# variety marginal means --- TODO confirm), relabeled so it appears
# in the plot legend.
df2 <- as.data.frame(weeds.m2.emm$varietyMargMeans)[2,]
df2$herbFac <- " Lambert marg'l mean"
# Interaction plot annotated to show the simple effects of herbicide
# within variety Lambert (the two double-headed arrows) relative to
# Lambert's marginal mean (horizontal reference line).
emmip(weeds.m2,herbFac~varFac,xlab="Variety",ylab="Weed biomass") +
  theme(legend.position="top",
        axis.text.x = element_text(hjust=0,angle = -45),
        # spell out hjust= in full; the original's 'h=' relied on
        # partial argument matching, which is fragile
        plot.title=element_text(hjust=.5,size=11)) +
  geom_hline(aes(yintercept=emmean,color=herbFac),
             data=df2,
             show.legend=TRUE) +
  geom_segment(aes(x=x1,y=y1,xend=x3,yend=y3+50),data=df1,
               arrow=arrow(ends="both",length=unit(0.1, "inches")),
               inherit.aes=FALSE)+
  geom_segment(aes(x=x2,y=y2,xend=x3,yend=y3-50),data=df1,
               arrow=arrow(ends="both",length=unit(0.1, "inches")),
               inherit.aes=FALSE)+
  labs(title="Simple effects of herbicide for variety Lambert",
       color="") +
  guides(color = guide_legend(override.aes = list(shape = "")))
# In contrast, here are the main effects of herbFac:
# df stacks the two herbicide marginal means and the grand mean
# (dropping the level column with [,-1]) so each can be drawn as a
# horizontal reference line; df1 gives the arrow endpoints at x=2.
df <- rbind(as.data.frame(weeds.m2.emm$herbMargMeans)[,-1],
            as.data.frame(weeds.m2.emm$grandMean)[,-1])
df$Mean <- c("none","2 weeks"," grand mean")
df1 <- data.frame(x1=2,y1=df$emmean[1],x2=2,y2=df$emmean[2],
                  x3=2,
                  y3=as.data.frame(weeds.m2.emm$grandMean)$emmean[1])
emmip(weeds.m2,herbFac~varFac,xlab="Variety",ylab="Weed biomass") +
  theme(legend.position="top",
        axis.text.x = element_text(hjust=0,angle = -45),
        # spell out hjust= in full; the original's 'h=' relied on
        # partial argument matching, which is fragile
        plot.title=element_text(hjust=.5,size=11)) +
  geom_hline(aes(yintercept=emmean,color=Mean),
             data=df) +
  geom_segment(aes(x=x1,y=y1,xend=x3,yend=y3+50),data=df1,
               arrow=arrow(ends="both",length=unit(0.1, "inches")),
               inherit.aes=FALSE) +
  geom_segment(aes(x=x2,y=y2,xend=x3,yend=y3-50),data=df1,
               arrow=arrow(ends="both",length=unit(0.1, "inches")),
               inherit.aes=FALSE) +
  labs(title="Main effects of herbicide",
       color="Means:") +
  guides(color = guide_legend(override.aes = list(shape = "")))
# In the presence of interaction, the simple effects of one
# factor differ significantly across the levels of the
# other factor. E.g., the effect of herbicide might be much greater for
# some varieties than others (a quantitative interaction) or herbicide
# could even have a helpful effect for some varieties and a harmful
# effect for others (a qualitative interaction). In such cases
# (especially qualitative interactions), main effects tests and
# comparisons of marginal means can be highly misleading and should be
# avoided. Instead, it is more appropriate to compare joint means
# across the levels of one factor within each level of the other
# factor. And if we prefer a test, it is appropriate to test whether
# simple effects of one factor are zero within each level of the other
# factor. Such tests are sometimes called tests of effect slices.
# Here are estimates of each simple effect of herbicide within
# each variety and tests of whether those effects are 0. These
# are tests that the effect of an herbicide level is significantly
# different from 0, the average effect. Such tests are usually not
# of much interest, but I wanted to show what the simple effects are.
# Note that by default, emmeans applies an FDR (false discovery rate)
# correction for each family of 3 simple effect tests.
contrast(weeds.m1.emm$jointMeans,simple="herbFac")
# Below are tests of the hypothesis that the three simple effects are
# all zero for each variety. These are tests that the joint means
# are the same across the 3 herbicide treatments for each variety
# (16 2-df tests of equal means across the 3 levels of herbicide).
ts <- test(contrast(weeds.m1.emm$jointMeans,simple="herbFac"),joint=TRUE)
ts
# The note that "df1 reduced due to linear dependence" is because
# there are three simple effects of herbicide for each variety,
# but one is redundant (can be determined from the others) so there
# are only 2 df for the test that these 3 effects are all equal to
# zero. This note is not an indicator that there's anything wrong
# here.
# In the presence of interaction, conducting the tests of effect slices
# given above would be an
# appropriate alternative to the test of main effects for herbicide.
# However, when we take that alternative, we replace a single 2-df test
# by 16 tests! Doesn't that inflate our Type I error rate?
# Yes! So it is a good idea to apply some multiplicity adjustment.
# Here we add p-values adjusted by Holm's method, which
# controls the FWER (familywise error rate), and by the Benjamini-Hochberg
# method, which controls the FDR (false discovery rate). There is no
# definitely correct multiplicity adjustment method to use, in part
# because the choice depends upon one's risk tolerance.
ts$fdr_adjpval <- format.pval(p.adjust(ts$p.value,method="fdr"),digits=3,eps=.0001)
ts$holm_adjpval <- format.pval(p.adjust(ts$p.value,method="holm"),digits=3,eps=.0001)
ts
# Type I, II, III tests
# When we have crossed factors we typically fit models with main
# effects and interactions so that we can assess whether the effect of
# one factor differs across any of the levels of the other factor
# (interaction) and the effects of each factor in isolation.
# When the factors interact, it may not be meaningful or appropriate
# to examine main effects. But when they do not interact
# (and even in some cases in which the interaction is orderly), we
# often will wish to assess main effects. How do we assess main
# effects in models with interactions?
# There are three main approaches, known as Type I, II, and III tests.
# These all give the same answer in balanced designs, but disagree
# when the data are unbalanced. So what's the difference between these
# methods and which one should we use?
# Type I tests are sequential tests. They rarely test interesting
# hypotheses on main effects in the presence of interaction effects.
# Type I tests depend on the order in which the model effects are
# entered into the model, which is determined by the syntax that we
# use in our software. This is very unappealing.
# Type II tests for main effects assess a factor's main effect after
# controlling for all other main effects and all interactions that
# do not involve the effect being assessed. Such tests are not
# dependent on the order in which the effects are specified in the
# modeling software. But they test hypotheses on weighted marginal
# means (marginal means for a factor where the averaging over
# the other factor that is done, is done in a weighted manner), where
# the weights are based on the sample sizes in each treatment.
# There are some applications where these hypotheses might be of
# interest, but when the levels of a factor are of equal relevance
# to the population on which we are making inference, these
# hypotheses are not what one would want to test.
# Type III tests (sometimes called marginal tests) test the main
# effect of a factor after adjusting for all other terms in the
# model. For a main effect of a factor (factor A, say), this tests
# that the (equally weighted) marginal means for factor A are all
# the same. This is usually the hypothesis of interest, particularly
# for experimental data, but also for many applications to
# observational studies.
# In R, the anova() and summary() functions give Type I tests.
# To get Type II and Type III tests, use the Anova() function from the
# car package.
# But in order for the Type III tests to be correct, it is essential
# that the model be parameterized with sum-to-zero constraints on
# the factor levels.
# To illustrate, let's consider the Soybean Weeds Example, but we'll
# have to make it unbalanced, first.
# Suppose varieties 3-16 were used in Rosemount and varieties 1-14
# were used in St.Paul. This gives an unbalanced design where
# a different set of treatments were observed in each block.
# Below we modify the data set to match this unbalanced description.
vs <- unique(weedDat$variety) # the 16 varieties
# Keep Rosemount (loc=="R") rows for varieties 3-16 and St. Paul
# (loc=="S") rows for varieties 1-14:
inds <- with(weedDat,(loc=="R" & variety %in% vs[-c(1:2)]) |
(loc=="S" & variety %in% vs[-c(15:16)]) )
weedDat.un <- weedDat[inds,] # the unbalanced data set
# Now fit the model to the unbalanced data. All 3 models below are
# equivalent.
# Remember, we must use sum-to-zero constraints for Type III
# tests, so the 3rd model below is parameterized correctly for
# obtaining those tests.
weeds.m1.un <- aov(weeds~locFac+varFac*herbFac,data=weedDat.un)
weeds.m1a.un <- aov(weeds~varFac*herbFac+locFac,data=weedDat.un)
# Same model, but with contr.sum (sum-to-zero) coding for all factors:
weeds.m1b.un <- aov(weeds~locFac+varFac*herbFac,data=weedDat.un,contrasts=list(locFac=contr.sum,varFac=contr.sum,herbFac=contr.sum))
# Type I tests depend on the order of the terms in the formula
# within the aov() function (not good!).
anova(weeds.m1.un); anova(weeds.m1a.un)
# Type III tests:
Anova(weeds.m1b.un,type=3) # RIGHT
# Type III tests with wrong type of contrasts give garbage results,
# but no warnings, so must know what you are doing!:
Anova(weeds.m1.un,type=3) # WRONG!
# Factors and Marginal Means in Logistic Regression.
####################################################
# A logistic regression example---Tobacco Budworms #
####################################################
# This example is a classic example for logistic regression
# from a toxicology experiment. Increasing doses of cypermethrin
# were given to male and female tobacco budworms. 6 groups of 20
# male budworms were given increasing doses (1,2,4,8,16,32)
# and 6 groups of 20 female budworms were given the same set of
# 6 doses.
# The data are read into R below and tabulated and plotted.
# get the data:
# Columns: sex (1=male, 2=female), dose, y (number dead out of m),
# m (group size, always 20 here).
budworm <- read.table(header=TRUE, text="
sex dose y m
1 1 1 20
1 2 4 20
1 4 9 20
1 8 13 20
1 16 18 20
1 32 20 20
2 1 0 20
2 2 2 20
2 4 6 20
2 8 10 20
2 16 12 20
2 32 16 20
")
# Do some data manipulation so that I can tabulate and plot the data
# more easily:
# Standalone logical indicator of the male rows; used for subsetting
# in the plotting code below (distinct from the numeric male column
# added inside within()).
male <- budworm$sex==1
budworm <- within(budworm,{
male <- as.numeric(sex==1)
sexFac <- factor(sex,levels=c(1,2),labels=c("Male","Female"))
doseFac <- factor(dose)
p.dead <- y/m # sample proportion dead in each sex-by-dose group
ldose <- log2(dose) # doses 1,2,...,32 are equally spaced on the log2 scale
dead <- y
alive <- m-y
samplogit <- log((dead+.5)/(alive+.5)) # empirical logit; +.5 avoids log(0)
})
# bud2 below is an ungrouped version of the same data set:
# each of the 12 grouped rows is replicated once per insect --- first
# the dead insects, then the survivors --- keeping only the factors.
bud2 <- rbind(budworm[rep(1:12,times=budworm$y),c("sexFac","doseFac")],
budworm[rep(1:12,times=budworm$alive),c("sexFac","doseFac")])
bud2$deadFac <- factor(c(rep(1,sum(budworm$y)),rep(0,sum(budworm$alive))),levels=c(0,1),labels=c("No","Yes"))
# Tabulate the data:
ftable(doseFac~sexFac+deadFac,data=bud2)
# Plot the data:
# Open a graphics device. windows() exists only on Windows builds of
# R, so the original unconditional call errors on Mac/Linux; fall
# back to dev.new() (quartz/X11) on other platforms.
if (.Platform$OS.type == "windows") {
  windows(record=TRUE) # recording window so plots can be paged back
} else {
  dev.new()
}
op <- par(pty="s") # square plotting region; par() returns old settings
# Plot (a): sample proportion dead vs. dose; males pch=1, females pch=2.
plot(p.dead~dose,data=budworm[male,],
     main="Sample prop dead vs. dose",pch=1,
     sub="Plot (a)",xlab="Dose", ylab="Prop dead")
points(budworm$dose[!male],budworm$p.dead[!male],pch=2)
legend("bottomright",pch=c(1,2),legend=c("male","female"))
# Plot (b): sample logit vs. dose.
plot(budworm$dose[male],budworm$samplogit[male],
     main="Sample logit vs. dose",pch=1, sub="Plot (b)",
     xlab="Dose",ylab="Sample logit (prop dead)")
points(budworm$dose[!male],budworm$samplogit[!male],pch=2)
legend("bottomright",pch=c(1,2),legend=c("male","female"))
# Plot (c): sample logit vs. log2(dose).
plot(budworm$ldose[male],budworm$samplogit[male],
     main="Sample logit vs. log2(dose)",pch=1, sub="Plot (c)",
     xlab="log2(dose)",ylab="Sample logit (prop dead)")
points(budworm$ldose[!male],budworm$samplogit[!male],pch=2)
legend("bottomright",pch=c(1,2),legend=c("male","female"))
par(op) # restore previous par settings
# Now fit a main-effects model with effects of sex and dose,
# (both as factors: sexFac and doseFac). Note the use of orthogonal
# polynomial contrasts for doseFac.
bud.mainmod <- glm(cbind(dead,alive)~sexFac+doseFac,
data=budworm,
family=binomial(link="logit"),
contrasts=list(doseFac=contr.poly))
# Test goodness of fit:
gof(bud.mainmod) # adequate fit implies sexFac*doseFac interaction non-significant
# Test main effects with LR tests (could use type=3 or type=2
# because no interaction in the model):
Anova(bud.mainmod,type=2,test.statistic="LR") # Don't use anova()!
# Note that using anova() to test significance of sexFac and doseFac
# will give Type I tests, which will give incorrect results.
anova(bud.mainmod) # WRONG!
# Now get marginal means for each factor. These are on the scale of the
# linear predictor. So these are average log odds for sex, averaged over
# dose; and for dose, averaged over sex.
# Here I've asked for Bonferroni-adjusted 95% intervals for each
# estimated marginal mean:
(bud.mainmod.emm <- emmeans(bud.mainmod,adjust="bonferroni",
specs= list(doseMargMeans= ~ doseFac,
sexMargMeans= ~ sexFac)))
# We can also easily plot these marginal means and the associated
# confidence intervals. Here I plot the marginal means for dose
# (type="link" keeps the log-odds scale):
plot(bud.mainmod.emm$doseMargMeans,adjust="bonferroni",
horizontal=FALSE,ylab="Dose",xlab="Log odds",type="link") +
ggtitle("Estimated log odds of death\nwith Bonferroni 95% CIs")
# Note that we can back-transform these results and plot the estimates
# and intervals shown above on the probability scale just by
# adding a type="response" argument:
plot(bud.mainmod.emm$doseMargMeans,adjust="bonferroni",
horizontal=FALSE,ylab="Dose",xlab="Probability",
type="response") +
ggtitle("Estimated probability of death\nwith Bonferroni 95% CIs")
# Note that because we coded doseFac with `contr.poly()` contrasts
# when fitting the model, the coefficients for this factor can tell
# us whether linear, higher-order effects of dose are significant.
arm::display(bud.mainmod,detail=TRUE) # briefer summary than given by summary() function
# Note that the coding of doseFac with contr.poly() treated the
# dose levels as equally spaced. The levels are 1,2,4,8,16,32,
# which are, of course, not equally spaced. But they are equally
# spaced on the log_2 scale.
log2(c(1,2,4,8,16,32))
# So what I did was code doseFac in terms of linear, quadratic, cubic,
# ... effects of log_2(dose) (call this ldose).
# We see from the (asymptotic) Z tests given above by the display()
# function that the linear effect of ldose is highly significant but
# none of the quadratic and higher effects are individually significant.
# Better to test these quadratic and higher effects with a single joint
# test, which is done below (a Wald test).
# Wald test that nonlinear effects of dose are null, computed two
# different ways:
# First use the lht() function from the car package:
coef.names <- names(coef(bud.mainmod))
# Drop the first three coefficients (intercept, sex, linear dose) and
# jointly test that the remaining higher-order terms are all zero:
lht(bud.mainmod,coef.names[-c(1:3)]) # lht for Linear Hypothesis Test.
# Can also do the test as a test of contrasts on the marginal
# means for doseFac:
test(contrast(bud.mainmod.emm$doseMargMeans,
method=as.data.frame(contr.poly(6)[,-1])),joint=T)
# Note that the F test given above w/ inf denominator df is
# equivalent to the chisq test given by lht (df1*F=Chisq).
# Also note that another way to test the same hypothesis is with a
# likelihood ratio test. The LRT and the Wald test we computed
# above are asymptotically equivalent, but the LRT has some advantages
# (in general), so it would have been a better choice here. Here is
# the LRT of the same hypothesis (no nonlinear effects of ldose):
# The reduced model has a linear ldose effect and sex-specific
# intercepts (the 0+ removes the overall intercept so each sex
# gets its own).
bud.finalmod <- glm(cbind(dead,alive)~0+sexFac+ldose,
data=budworm,family=binomial(link="logit"))
anova(bud.finalmod,bud.mainmod,test="LRT")
# The final model (bud.finalmod) has linear effects of ldose
# with different intercepts for each sex.
# We finish the example with a summary of this model and a plot of
# the fitted curves from this model for male and female budworms,
# plotted both on the log-odds scale and the probability scale:
summary(bud.finalmod)
# Now make the plots:
ldose0 <- seq(from=0,to=5,by=.1) # grid of log2(dose) values for fitted curves
expit <- function(x){1/(1+exp(-x))} # inverse logit
# betahat[1:2] are the sex-specific intercepts (Male, Female in
# factor-level order); betahat[3] is the common ldose slope.
betahat <- coef(bud.finalmod)
op <- par(pty="s")
# Observed proportions with fitted probability curves.
# Use the full column name p.dead: the original's budworm$p resolved
# to p.dead only via $'s partial matching of column names.
plot(budworm$ldose[male],budworm$p.dead[male],xlab="Log2(dose)",
     ylab="Mortality Probability",
     main="Fitted probability from model bud.finalmod",pch=1)
points(budworm$ldose[!male],budworm$p.dead[!male],pch=2)
lines(ldose0,expit( betahat[1]+betahat[3]*ldose0),lty=2) # males
lines(ldose0,expit( betahat[2]+betahat[3]*ldose0),lty=1) # females
legend("topleft",legend=c("males","females"),
       lty=c(2,1),pch=c(1,2),cex=.8)
# Sample logits with fitted lines on the log-odds scale.
plot(budworm$ldose[male],budworm$samplogit[male],xlab="Log2(dose)",
     ylab="Log Odds",main="Fitted log odds from model bud.finalmod",
     pch=1)
points(budworm$ldose[!male],budworm$samplogit[!male],pch=2)
lines(ldose0,( betahat[1]+betahat[3]*ldose0),lty=2) # males
lines(ldose0,( betahat[2]+betahat[3]*ldose0),lty=1) # females
legend("topleft",legend=c("males","females"),lty=c(2,1),pch=c(1,2),
       cex=.8)
par(op)
dev.off(which=dev.cur()) # close the graphics window that we opened.
# End of script.