Growth Modelling in Ruminants
Using covariance matrices in R
In this post I will show you how I modelled growth in ruminants using statistical models on a commercial dataset. Because the dataset is commercial I cannot share it, and the data have been slightly augmented, but I will try to make my workflow as transparent as possible. If you know me, you know I like mixed models, so to reuse the code below all you need is a longitudinal dataset of your own.
Let's get started.
rm(list = ls())
#### LIBRARIES ####
library(foreign)
library(lme4)
library(ggplot2)
library(rms)
library(plyr)
library(Hmisc)
library(reshape2)
library(boot)
library(sjPlot)
library(sjstats)
library(sjmisc)
library(AICcmodavg)
require(parallel)
library(gridExtra)
library(coefplot)
library(coda)
library(aods3)
library(plotMCMC)
library(bbmle)
library(nlme)
library(merTools)
library(RLRsim)
library(pbkrtest)
library(multcomp)
library(lsmeans)
library(multcompView)
library(lattice)
library(splines)
library(lmtest)
library(car)
library(corrplot)
library(PerformanceAnalytics)
library(eqs2lavaan)

Growth_all <- read.csv("Growth_all.csv")
str(Growth_all)
Growth_all$Treat<-as.factor(Growth_all$Treat)
Growth_wide<-reshape(Growth_all,
idvar = c("Pen", "Treat","Block"),
timevar = "Week",
direction = "wide")
#### Exploratory summaries ####
# Not the same baseline growth
Growth_melt<-ddply(Growth_all, c("Week", "Treat"), summarise,
N = sum(!is.na(BW)),
Mis = sum(is.na(BW)),
Mean = round(mean(BW, na.rm=T),3),
Median = round(median(BW, na.rm=T),3),
SD = round(sd(BW, na.rm=T),3),
SE = round(sd(BW, na.rm=T) / sqrt(N),5),
LCI = round(Mean - (2*SE),3),
HCI = round(Mean + (2*SE),3))
When you are looking for treatment differences in a longitudinal dataset, you are also looking for a way to deal with the covariance structure of that dataset. Longitudinal data are nested data, and nested data exhibit cross-correlation. These cross-correlations decrease the effective sample size of a study and hence need to be modelled. Not taking them into account means that you will underestimate the standard errors in the data and report p-values that are simply wrong (which is a separate issue from the heavy criticism of p-values in general).
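To make that concrete, here is a minimal simulated sketch (hypothetical data, not the trial data): a pen-level treatment is applied to pens that are measured repeatedly, and the naive linear model, which treats all 200 observations as independent, reports a standard error for the treatment effect that is far too small.
set.seed(123)
sim <- expand.grid(pen = factor(1:20), week = 1:10)
sim$treat <- ifelse(as.numeric(sim$pen) <= 10, "A", "B")  # treatment assigned per pen
pen_effect <- rnorm(20, sd = 5)                           # deviation shared within a pen
sim$bw <- 50 + 2 * sim$week + 3 * (sim$treat == "B") +
  pen_effect[as.numeric(sim$pen)] + rnorm(nrow(sim), sd = 2)
coef(summary(lm(bw ~ week + treat, data = sim)))["treatB", ]   # naive SE: far too small
coef(summary(lmer(bw ~ week + treat + (1 | pen), data = sim)))["treatB", ]  # honest SE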
Let's take a look at the correlation and covariance matrices as they can be derived from the raw data itself.
cormatrix<-as.data.frame(cor(Growth_wide[,4:25])) # a lot of autocorrelation
covmatrix<-as.data.frame(cov(Growth_wide[,4:25])) # covariance not homogeneous --> heterogeneity of variance
cor<- rcorr(as.matrix(Growth_wide[,4:25]))
corrplot(cor$r)
chart.Correlation(Growth_wide[,4:25], histogram=TRUE, pch=19)
plotCov(cov(Growth_wide[,4:25]))
Now, when you have a dataset containing longitudinal data, you can expect a dense covariance / correlation matrix. How strong the correlations are depends on the measurement times chosen and hence on the distance between the observations in time.
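A quick sketch to see that decay (assuming columns 4:25 of Growth_wide hold the BW measurements in time order): average the raw correlations at each lag.
cm <- cor(Growth_wide[, 4:25], use = "pairwise.complete.obs")
lag <- abs(outer(seq_len(ncol(cm)), seq_len(ncol(cm)), "-"))
# lag counts column positions, not weeks, since the spacing widens later in the trial
tapply(cm[lower.tri(cm)], lag[lower.tri(lag)], mean)  # mean correlation per lag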
Let's graph the data.
theme_set(theme_bw())
myx<-scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,18,20,22,24,26,28,30))
myy<-scale_y_continuous(breaks = seq(0, 400, by = 50))
ggplot(Growth_all, aes(x=Week, y=BW, group=Pen, colour=Treat))+
myx+
myy+
geom_line() +
stat_summary(aes(group=Treat), fun.y=mean, geom="point", size=3.5)+
stat_summary(aes(group=Treat), fun.y=mean, geom="line", lwd=1.5)
## Growth boxplot
theme_set(theme_bw())
myx<-scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,18,20,22,24,26,28,30))
myy<-scale_y_continuous(breaks = seq(0, 400, by = 50))
ggplot(Growth_all, aes(x=Week, y=BW, group=Week, colour=Treat))+
myx+
myy+
geom_boxplot()
## Growth boxplot - Treatment
theme_set(theme_bw())
myx<-scale_x_continuous(breaks=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,18,20,22,24,26,28,30))
myy<-scale_y_continuous(breaks = seq(0, 400, by = 50))
ggplot(Growth_melt, aes(x=Week, y=Mean, colour=Treat))+
myx+
myy+
geom_line(lwd=1)+
geom_ribbon(aes(ymin=LCI, ymax=HCI, fill=Treat), alpha=0.1)
## Growth lineplot
sjp.poly(Growth_all$BW, Growth_all$Week, 1)
sjp.poly(Growth_all$BW, Growth_all$Week, 2)
sjp.poly(Growth_all$BW, Growth_all$Week, 3)
Now we can start building our first set of models, assess them individually, and compare them to get a feel for the important predictors.
mix1<-lmer(BW~ns(Week,2)+Treat+(ns(Week,2)|Block), data=Growth_all) # perfect correlation intercept + Week
summary(mix1)
plot(mix1)
mix0<-lmer(BW~Week+Treat+(Week|Block), data=Growth_all, REML=FALSE)
mix1<-lmer(BW~ns(Week,2)+Treat+(ns(Week,2)|Block), data=Growth_all, REML=FALSE)
mix2<-lmer(BW~ns(Week,2)*Treat+(ns(Week,2)|Block), data=Growth_all, REML=FALSE)
anova(mix1, mix2) # no treatment interaction
mix3<-lmer(BW~ns(Week,3)+Treat+(ns(Week,3)|Block), data=Growth_all, REML=FALSE)
anova(mix1,mix3) # no significant change in loglik and deviance
anova(mix0, mix1)
mix4<-lmer(BW~ns(Week,2)+Treat+(ns(Week,2)|Block/Pen), data=Growth_all, REML=FALSE)
summary(mix4) # huge variance for Week
anova(mix1, mix4) # does show a way better fit but not sure if okay
mix5<-lmer(BW~ns(Week,2)+Treat+(ns(Week,2)|Block:Pen), data=Growth_all, REML=FALSE)
anova(mix4, mix5) # huge variance for Week
summary(mix5)
anova(mix1,mix2,mix3,mix4,mix5)
ICtab(mix1,mix2,mix3,mix4,mix5)
Let's also apply the nlme package, which I find to be very complementary to the lme4 package.
Growth_group_pen<-groupedData(BW~Week|Pen,
data=Growth_all,
FUN=mean)
Growth_group_blockpen<-groupedData(BW~Week|Block/Pen,
data=Growth_all,
FUN=mean)
Growth_group_block<-groupedData(BW~Week|Block,
data=Growth_all,
FUN=mean)
mix1_nlme_pen<-lme(BW~ns(Week,2)+Treat,
data=Growth_group_pen,
random=~ns(Week,2),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
summary(mix1_nlme_pen)
plot(mix1_nlme_pen); plot(mix1)
plot(augPred(mix1_nlme_pen, level = 0:1))
mix1_nlme_blockpen<-lme(BW~ns(Week,2)+Treat,
data=Growth_group_blockpen,
random=~ns(Week,2),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
anova(mix1_nlme_pen,mix1_nlme_blockpen)
intervals(mix1_nlme_pen)
plot(acf(resid(mix1_nlme_pen)))
plot(pacf(resid(mix1_nlme_pen)))
A way to statistically test for autocorrelation is to run the Durbin-Watson test on a standard linear regression, which I am doing below. The test looks for significant correlations between a value and its lagged values, which is exactly what autocorrelation is. The null hypothesis of the test is NO autocorrelation.
fit<-lm(BW~Week, data=Growth_all)
summary(fit)
dbtest<-durbinWatsonTest(fit, max.lag=10, simulate=TRUE, method="resample")
dbtest
The simplest, but often quite suitable, covariance structure to implement in your model to deal with autocorrelation is the AR(1) matrix with parameter rho. It is a correlation matrix that assumes the correlation between two adjacent timepoints equals rho, decaying by a factor of rho for each additional step. Here I specify 0.9, which means the correlation matrix looks like this:
- corr(Time1, Time2) = 0.9
- corr(Time1, Time3) = 0.9 * 0.9
- corr(Time1, Time4) = 0.9 * 0.9 * 0.9
- corr(Time2, Time3) = 0.9
etc.
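The same structure can be written out directly as a sketch: the AR(1) correlation between timepoints i and j is simply rho^|i - j|.
rho <- 0.9
round(rho^abs(outer(1:4, 1:4, "-")), 3)  # AR(1) correlation matrix for four timepoints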
ARmatrix <- corAR1(0.9,form=~Week|Pen)
ARmatrix <- Initialize(ARmatrix, data=Growth_all)
cor_ARmatrix<-corMatrix(ARmatrix)
str(cor_ARmatrix)
par(mfrow = c(2, 2))
heatmap(cor_ARmatrix$`602`, main="Correlation Matrix for cow 602")
Let's apply it inside the mixed models, trying different structures and allowing heterogeneity of variance or not.
mix2_nlme_pen<-lme(BW~ns(Week,2)+Treat,
data=Growth_group_pen,
random=~ns(Week,2),
correlation=corAR1(0.8,form=~1),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2_nlme_blockpen<-lme(BW~ns(Week,2)+Treat,
data=Growth_group_blockpen,
random=~ns(Week,2),
correlation=corAR1(0.8,form=~Week),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2.1_nlme_blockpen<-lme(BW~ns(Week,2)+Treat,
data=Growth_group_blockpen,
random=~0+Week,
correlation=corAR1(0.8,form=~Week),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2.2_nlme_pen<-lme(BW~ns(Week,2)+Treat+Block,
data=Growth_all,
random=~0+Week|Pen,
correlation=corAR1(0.8,form=~Week|Pen),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2.3_nlme_pen<-lme(BW~ns(Week,2)+Treat+Baseline+Block,
data=Growth_group_blockpen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2.4_nlme_pen<-lme(BW~ns(Week,2)+Treat+Baseline+as.factor(Block),
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(opt="optim", maxIter=10000, msMaxIter = 1000))
mix2.5_nlme_pen<-lme(BW~ns(Week,2)+Treat+Baseline+as.factor(Block),
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(maxIter=10000, msMaxIter = 1000))
mix2.6_nlme_pen<-lme(BW~ns(Week,3)+Treat+Baseline+as.factor(Block),
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(maxIter=10000, msMaxIter = 1000))
mix2.7_nlme_pen<-lme(BW~ns(Week,3)+Treat+Baseline,
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(maxIter=10000, msMaxIter = 1000))
anova(mix2.3_nlme_pen,mix2.5_nlme_pen, mix2.6_nlme_pen, mix2.7_nlme_pen)
plot(mix2.4_nlme_pen)
plot(augPred(mix2.7_nlme_pen, level=0:1), grid=T)
plot(augPred(mix2.4_nlme_pen),aspect="xy",grid=T)
pred<-augPred(mix2.4_nlme_pen, level=0:1)
anova(mix2.2_nlme_pen,mix2.4_nlme_pen) # even worse fit when specifying block
summary(mix2.4_nlme_pen)
plot(acf(resid(mix2.7_nlme_pen)))
plot(pacf(resid(mix2.7_nlme_pen)))
intervals(mix2.7_nlme_pen)
The autocorrelation plots are extremely helpful in modelling the error part of the model. There are two types of plots (a simulated illustration follows the list):
- ACF: the ACF measures and plots the average correlation between data points in a time series and earlier values of the series, for different lag lengths. For example, the correlation at the first lag is the correlation between the values of the series at time t and the values at time t − 1; the correlation at the second lag is the correlation between the values at time t and those at time t − 2.
- PACF: the PACF is similar, except that each correlation controls for correlations at shorter lags. The ACF and PACF are therefore identical at the first lag, because both measure the correlation between data points at time t and those at time t − 1. At the second lag, however, the PACF measures the correlation between time t and time t − 2 after controlling for the correlation at lag 1.
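As a simulated illustration (a hypothetical series, not the growth data): for a pure AR(1) process the ACF decays geometrically while the PACF cuts off after lag 1, which is exactly the pattern that suggests an AR(1) error structure is worth trying.
set.seed(42)
ar1 <- arima.sim(model = list(ar = 0.8), n = 500)  # simulate an AR(1) series
par(mfrow = c(1, 2))
acf(ar1, main = "ACF: geometric decay")
pacf(ar1, main = "PACF: single spike at lag 1")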
Let's look deeper into the model and run some diagnostics.
plot(ranef(mix2.4_nlme_pen))
qqnorm(residuals(mix2.7_nlme_pen))
qqline(residuals(mix2.7_nlme_pen))
plot(augPred(mix2.4_nlme_pen))
pairs(mix2.3_nlme_pen)
vario<-Variogram(mix2_nlme_blockpen)
plot(vario)
plot(fitted(mix2.7_nlme_pen), residuals(mix2.7_nlme_pen)); abline(h=0)
plot(compareFits(coef(mix2.4_nlme_pen), coef(mix2.5_nlme_pen)))
VarCorr(mix2.2_nlme_pen)
VarCorr(mix2.3_nlme_pen)
VarCorr(mix2.4_nlme_pen)
mix2.7_ranef<-ranef(mix2.7_nlme_pen,augFrame = T, level=1) # works for levels 1,2,3 (if in model present) but not 0
plot(mix2.7_ranef, grid=TRUE)
# Shows variance per Pen / Block
bwplot(Growth_all$Pen ~ resid(mix2.7_nlme_pen))
bwplot(Growth_all$Block ~ resid(mix2.7_nlme_pen))
One way to check specifically whether nesting is necessary for the random effects is to build nested models and let bootstrapping simulate the likelihood ratio test. Here, I am checking models with the same fixed effects and different random effects:
- (0+Week|Block) + (0+Week|Pen:Block)
- (0+Week|Pen:Block)
- (0+Week|Block)
As you can see the models are nested.
(nc <- detectCores())
cl <- makeCluster(rep("localhost", nc))
mA<-lmer(BW~ns(Week,3)+Treat+(0+Week|Block)+(0+Week|Pen:Block), data=Growth_all, REML=TRUE)
m.block<-update(mA,.~.-(0+Week|Pen:Block))
m.0<-update(mA,.~.-(0+Week|Block))
anova(mA, m.block, m.0)
exactRLRT(m.block)
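The cluster created above can be put to work for the bootstrap part. A sketch using PBmodcomp from the pbkrtest package, which simulates the reference distribution of the LRT instead of relying on the chi-square approximation (nsim is kept small here; increase it for stable p-values):
pb <- PBmodcomp(mA, m.block, nsim = 200, cl = cl)  # refits with ML internally
summary(pb)
stopCluster(cl)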
But it is quite a model to fit. So, what if I stick with my m.block model and include a baseline covariate instead? Choices are not easy, and for many people a model with a baseline covariate is easier to understand than one with a sophisticated error component.
# Random blocks or baseline covariate?
blockvar<-lmer(BW~ns(Week,3)+Treat+(0+Week|Pen:Block), data=Growth_all, REML=FALSE)
baselinecovar<-lmer(BW~ns(Week,3)+Treat+Baseline+(0+Week|Pen), data=Growth_all, REML=FALSE)
anova(blockvar, baselinecovar) # best to have a baseline covariate
plot(blockvar)
plot(baselinecovar)
VarCorr(baselinecovar)
VarCorr(blockvar)
In the end, I believe I had to go with the baseline model since it was more widely accepted, but I cannot say the choice felt satisfactory to me back then. Let's see if the predictions make sense.
## Cow specific predictions
newdat<-expand.grid(Pen=unique(Growth_all$Pen),
Block=unique(Growth_all$Block),
Week=as.numeric(1:35),
Baseline=unique(Growth_all$Baseline),
Treat=unique(Growth_all$Treat))
dim(newdat)
Pred<-predict(mix2.7_nlme_pen, newdata = newdat, level = 0:1 )
newdat$pred.fix<-Pred$predict.fixed
newdat$pred.pen<-Pred$predict.Pen
newdat$idtreat<-paste(newdat$Pen,newdat$Treat)

## Plot
myx<-scale_x_continuous(breaks=seq(1,35,by=1))
myy<-scale_y_continuous(breaks= seq(0, 400, by = 25))
ggplot(newdat, aes(x=Week, colour=Treat))+
myx+
myy+
labs(title = "Growth(BW) over 35 Weeks", y="BW", x="Time(Week)")+
theme_bw()+
stat_summary(aes(y=pred.pen, group=idtreat),fun.y=median, geom="line", lwd=0.4, alpha=0.4)+
stat_summary(aes(y=pred.fix, group=Treat),fun.y=median, geom="line", lwd=1)+
stat_summary(aes(y=pred.fix, group=Treat),fun.y=median, geom="point", lwd=2)
Now, I already said that I am using both the nlme and lme4 packages for linear mixed models. The nlme package is older, but has more functionality when it comes to modelling the error term via variance-covariance matrices. At the very least, it offers ready-made structures.
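One example of those ready-made structures, as a sketch (mix_hetero is a name I introduce here, not a model from the analysis above): varIdent gives every week its own residual variance on top of the AR(1) correlation, something lme4 cannot do out of the box. With around 22 timepoints this adds a lot of variance parameters, so expect a slow fit.
mix_hetero<-lme(BW~ns(Week,3)+Treat+Baseline,
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
weights=varIdent(form=~1|Week), # separate residual variance per week
method="ML",
control=lmeControl(maxIter=10000, msMaxIter = 1000))
anova(mix2.7_nlme_pen, mix_hetero)  # does the extra heterogeneity pay for itself?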
So, let's run a model in nlme with a modelled error term, and one in lme4 without, to see whether the lme4 random-effects specification can compensate for the unmodelled cross-correlation in the error term.
mix2.7_lme4_pen<-lme4::lmer(BW~ns(Week,3)+Treat+Baseline+(0+Week|Pen), data=Growth_all, REML=FALSE)
mix2.7_nlme_pen<-lme(BW~ns(Week,3)+Treat+Baseline,
data=Growth_group_pen,
random=~0+Week,
correlation=corAR1(0.95, form=~Week),
method="ML",
control=lmeControl(maxIter=10000, msMaxIter = 1000))
VarCorr(mix2.7_nlme_pen)
VarCorr(mix2.7_lme4_pen)
plot(mix2.7_nlme_pen)
plot(mix2.7_lme4_pen)
par(mfrow=c(2,2))
plot(acf(resid(mix2.7_nlme_pen)))
plot(acf(resid(mix2.7_lme4_pen)))
plot(pacf(resid(mix2.7_nlme_pen)))
plot(pacf(resid(mix2.7_lme4_pen)))
AIC(mix2.7_lme4_pen)
AIC(mix2.7_nlme_pen)
newdat$fitlme4<-predict(mix2.7_lme4_pen, newdata=newdat) # store lme4 predictions so the plot below can use fitlme4
ggplot(Growth_all, aes(x=Week, y=BW, colour=Treat))+
theme_bw()+
geom_point(col="grey80")+
labs(title = "Growth(BW) over 35 Weeks", y="BW", x="Time(Week)")+
stat_summary(dat=newdat, aes(y=pred.pen, group=idtreat),fun=median, geom="line", lwd=0.4, alpha=0.4)+
stat_summary(dat=newdat, aes(y=pred.fix, group=Treat, colour="nlme"),fun=median, geom="line", lwd=1)+
stat_summary(dat=newdat, aes(y=fitlme4, group=Treat, colour="lme4"),fun=median, geom="line", lwd=1)+
scale_colour_manual(name="Method", values=c(nlme="blue", lme4="red"))
So, this was my modelling-growth-in-ruminants post. I hope you enjoyed it. Let me know if you have questions!