print(pi)
## [1] 3.141593
# create a new environment in which to load data
## this prevents cluttering the workspace
data.env <- new.env()
# load data (from params) into the new environment
data(list = params$data,envir = data.env)
if(!params$rerun.errors){load("data/error-rate_binomial-models.RData")}
# define full data to use
full.dat <- data.env$full.data %>%
dplyr::select(data.env$common.insects,Genet,
Unique.ID, SerialNo, Block,
survey.event, Survey.Year,
Vol, PG, CT,
BBreakDegDay, EFNMean) %>%
na.omit()
full.y <- factor(full.dat$Petiole.Gall > 0)
## Remove genets with no replication within years
# which genets have less than 8 replicates over the 4 survey events?
genet.n <- full.dat %>% group_by(Genet) %>% tally() %>% filter(!n < 8)
full.dat <- full.dat %>% filter(Genet %in% genet.n$Genet)
full.dat <- full.dat %>% mutate(survey.month = factor(gsub(survey.event,
pattern = "(.*)(1.)",
replacement = "\\1")))
if(params$rerun.errors){
error.rate.lm <- NULL
error.rate.lm.gen <- NULL
error.rate.glm <- NULL
error.rate.glm.gen <- NULL
error.rate.lme <- NULL
error.rate.qda <- NULL
error.rate.pred.qda <- NULL
for(i in 1:params$error.iters){
# Subset the data
sub.samp <- full.dat %>%
filter(Genet %in% sample(data.env$full.data$Genet, size = 350))
n <- nrow(sub.samp)
y <- as.factor(sub.samp$Petiole.Gall > 0)
# lm
pet.lm <- lm(data = sub.samp,
formula = y ~ PG + CT +
Vol + Survey.Year
)
### Predictions
yhat.lm <- predict(pet.lm)
error.rate <- 1 - sum((yhat.lm > .05) == y)/n
error.rate.lm <- c(error.rate.lm,error.rate)
##---
# lm with genet
pet.lm.gen <- lm(data = sub.samp,
formula = y ~ PG + CT +
Vol + Survey.Year + Genet
)
### Predictions
yhat.lm.gen <- predict(pet.lm.gen)
error.rate <- 1 - sum((yhat.lm.gen > .05) == y)/n
error.rate.lm.gen <- c(error.rate.lm.gen,error.rate)
##---
# glm
pet.glm <- glm(data = sub.samp, family = "binomial",
formula = y ~ PG + CT +
Vol + Survey.Year
)
### Predictions
yhat.glm <- predict(pet.glm)
error.rate <- 1 - sum((yhat.glm > .05) == y)/n
error.rate.glm <- c(error.rate.glm,error.rate)
##---
# glm with genet
pet.glm.gen <- glm(data = sub.samp, family = "binomial",
formula = y ~ PG + CT +
Vol + Survey.Year
)
### Predictions
yhat.glm.gen <- predict(pet.glm.gen)
error.rate <- 1 - sum((yhat.glm.gen > .05) == y)/n
error.rate.glm.gen <- c(error.rate.glm.gen,error.rate)
##---
# lme with genet as random effect
pet.lme <- glmer(data = sub.samp, family = binomial,
formula = y ~ PG + CT +
Vol + Survey.Year + (1|Genet)
)
### Predictions
yhat.lme <- predict(pet.lme)
error.rate <- 1 - sum((yhat.lme > .05) == y)/n
error.rate.lme <- c(error.rate.lme,error.rate)
##---
# lme with genet as random effect
pet.qda <- qda(data = sub.samp, (Petiole.Gall > 0) ~ PG + CT +
Vol + Survey.Year
)
### Predictions
yhat.qda <- predict(pet.qda)
error.rate <- 1 - sum(yhat.qda$class == y)/n
error.rate.qda <- c(error.rate.qda,error.rate)
##---
# Now predict to the FULL data with QDA
pred.y.qda <- predict(pet.qda,full.dat)
error.rate <- 1 - sum(pred.y.qda$class == full.y)/nrow(full.dat)
error.rate.pred.qda <- c(error.rate.pred.qda,error.rate)
print(i)
}
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
if(params$rerun.errors){
error.rate.lme.yr <- NULL
for(i in 1:8){
# Subset the data
sub.samp <- full.dat %>%
filter(Genet %in% sample(data.env$full.data$Genet, size = 350))
n <- nrow(sub.samp)
y <- as.factor(sub.samp$Petiole.Gall > 0)
# lme with genet as random effect
pet.lme.yr <- glmer(data = sub.samp, family = binomial,
formula = y ~ PG + CT +
Vol + (1|Genet) + (1|Survey.Year)
)
### Predictions
yhat.lme.yr <- predict(pet.lme.yr)
error.rate <- 1 - sum((yhat.lme.yr > .05) == y)/n
error.rate.lme.yr <- c(error.rate.lme.yr,error.rate)
##---
print(i)
}
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
if(params$rerun.errors){
error.rate.big <- NULL
# for(i in 1:4){# big glm
system.time(advanced.glm <- glmer(data = sub.samp,
family = binomial,
# control = glmerControl(optimizer = "bobyqa"),
formula = factor(Petiole.Gall > 0) ~
CT + PG + Vol +
BBreakDegDay + survey.month +
Survey.Year + (1|Genet)
))
### Predictions
yhat.big <- predict(advanced.glm, type = "response")
error.rate <- 1 - (sum((yhat.big > .05) ==
y))/n
error.rate.big <- c(error.rate.big,error.rate)
# error.absent <- 1 - sum((yhat.big[which(full.y == FALSE)] > 0) == full.y[which(full.y == FALSE)])/length(full.y[which(full.y == FALSE)])
# error.present <- 1 - sum((yhat.big[which(full.y == TRUE)] > 0) == full.y[which(full.y == TRUE)])/length(full.y[which(full.y == TRUE)])
##---
# print(i)}
}
# summary(advanced.glm)
# boxplot(error.rate.big,error.absent,error.present)
mod.error.data <- data.frame(
error = c(error.rate.lm,
error.rate.lm.gen,
error.rate.glm,
error.rate.glm.gen,
error.rate.qda,
error.rate.pred.qda,
error.rate.lme),
method = rep(c("lm","lm.gen","glm","glm.gen","qda","qda.pred","glme"),each = params$error.iters)
)
mod.error.data <- rbind(mod.error.data,data.frame(error = error.rate.lme.yr,
method = "glme.yr.gen"))
mod.error.data <- rbind(mod.error.data,data.frame(error = error.rate.big,
method = "glm.big"))
ggplot(data = mod.error.data, aes(x = method, y = error, fill = method)) +
geom_boxplot() + theme_dark()
if(params$rerun.errors){
save(list = (c("error.rate.lm",
"error.rate.lm.gen",
"error.rate.glm",
"error.rate.glm.gen",
"error.rate.qda",
"error.rate.pred.qda",
"error.rate.lme",
"error.rate.lme.yr",
"error.rate.big",
# table of above
"mod.error.data",
# the models
"pet.lm","pet.glm","pet.glm.gen","pet.lm.gen",
"pet.lme","pet.lme.yr","pet.qda","advanced.glm"
)),
file = "data/error-rate_binomial-models.RData")
}
We can see that GLMs have a much lower over-all error rate than the other types.
I’ll test out the best ways to make GLMMs work:
# reload the data
load("data/r-objects.RData")
# define the variables to use in the models
variables <- c(common.insects,"Vol","CT","PG","survey.month",
"Genet", "Block","Survey.Year",
"BBreakDegDay","EFNMean")
rand.vars <- "Genet"
y.var <- "Harmandia"
# define full data to use
full.dat <- full.data %>%
dplyr::select(common.insects,Genet,
Unique.ID, SerialNo, Block,
survey.event, Survey.Year,
Vol, PG, CT,
BBreakDegDay, EFNMean, Min.per.Tree) %>%
na.omit()
# define response variable
full.y <- full.dat[,y.var] > 0
## change survey event to survey month (jun/aug)
full.dat <- full.dat %>%
mutate(survey.month = factor(gsub(survey.event,
pattern = "(.*)(1.)",
replacement = "\\1"))
)
## define the fixed effects
fixed.vars <- variables[! variables %in% c(common.insects, rand.vars)]
full.formula <- formula(paste("factor(",y.var, "> 0) ~", # response
paste(fixed.vars, collapse = "+"), # fixefs
"+ (1 | Genet)")) #ranefs
if(!params$run.mods){load("data/initial-modeling-objects.RData")}
model.save.list <- NULL
#Full Model
## Fit the full model using glmer's defaults
if(params$run.mods){
mod.1 <- glmer(formula = full.formula, data = full.dat, family = "binomial")
}
## print the model summary
summary(mod.1, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +
## Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
## Data: full.dat
##
## AIC BIC logLik deviance df.resid
## 5989.3 6070.0 -2982.6 5965.3 6167
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0869 -0.5213 -0.3802 -0.2430 4.3397
##
## Random effects:
## Groups Name Variance Std.Dev.
## Genet (Intercept) 0.3361 0.5797
## Number of obs: 6179, groups: Genet, 490
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.591820 0.360329 -15.519 < 2e-16 ***
## Vol 0.013596 0.001704 7.977 1.5e-15 ***
## CT -0.007433 0.013572 -0.548 0.583906
## PG -0.005372 0.014806 -0.363 0.716755
## survey.monthjun 0.621005 0.072787 8.532 < 2e-16 ***
## Block2 0.081668 0.097574 0.837 0.402602
## Block3 0.250067 0.096560 2.590 0.009605 **
## Block4 0.319777 0.096325 3.320 0.000901 ***
## Survey.Year2017 -0.873173 0.083898 -10.408 < 2e-16 ***
## BBreakDegDay 0.016120 0.001339 12.038 < 2e-16 ***
## EFNMean -0.132759 0.093140 -1.425 0.154052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## convergence code: 0
## Model failed to converge with max|grad| = 0.126957 (tol = 0.001, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## use all optimizers to fit the model and compare them
# optim.1 <- allFit(mod.1)
##
# mod.select <- drop1(mod.1)
model.save.list <- c(model.save.list, "mod.1")
Let’s re-optimize the model to fix any convergence issues. The steps we will take were suggested by Ben Bolker and include, 1. reduce the stopping tolerance, 2. rescale the variables, 3. Recompute gradient and Hessian, 4. Restart the fit from starting points equal to the estimated parameters from the first fit.
These steps ensure that model truly is maximizing the likelihood of the parameter values to the best of the abilities of all available algorithms and methods.
# Trouble shoot convergence issues: `?convergence()`
model.opt <- function(mod, scaled.data, strict = FALSE){
stopifnot(isGLMM(mod) | isLMM(mod))
# SC.DAT <- deparse(substitute(scaled.data))
## 1. decrease stopping tolerances
strict_tol <- lmerControl(optCtrl=list(xtol_abs=1e-8, ftol_abs=1e-8))
if (all(mod@optinfo$optimizer == "nloptwrap")) {
mod <- update(mod, control=strict_tol)
}
## fit the model to scaled data
scale.mod <- update(mod, data = scaled.data)
## 3. recompute gradient and Hessian with Richardson extrapolation
devfun <- update(scale.mod, devFunOnly = TRUE)
if (isLMM(scale.mod)) {
pars <- getME(scale.mod,"theta")
} else {
## GLMM: requires both random and fixed parameters
pars <- getME(scale.mod, c("theta","fixef"))
}
if (require("numDeriv")) {
hess <- hessian(devfun, unlist(pars))
grad <- grad(devfun, unlist(pars))
scgrad <- solve(chol(hess), grad)
}
## compare with internal calculations:
# scale.mod@optinfo$derivs # looked nearly identical
## 4. restart the fit from the original value (or
## a slightly perturbed value):
scale.mod.restart <- update(scale.mod, start = pars)
if(strict){
restart.strict <- update(scale.mod, start = pars, control = strict_tol)
} else{restart.strict <- NULL}
# set.seed(101)
# pars_x <- runif(length(pars),pars/1.01,pars*1.01)
# scale.mod.restart2 <- update(scale.mod, start=pars_x,
# control = strict_tol)
return(list(scale.mod, scale.mod.restart, pars,mod.tol = mod, restart.strict))
}
model.save.list <- c(model.save.list,"model.opt")
## rescale Data
## in order to fix the eigenvalue warning, I'm going to rescale the data
scaled.dat <- transform(full.dat,
Vol = scale(Vol^(1/3)),
CT = scale(CT),
PG = scale(PG),
BBreakDegDay = scale(BBreakDegDay),
EFNMean = scale(EFNMean)
)
if(params$run.mods){
pres.mod.opt <- model.opt(mod = mod.1,scaled.data = scaled.dat)
}
mod.scl.1 <- pres.mod.opt[[1]]
mod.scl.1.restart <- pres.mod.opt[[2]]
summary(mod.scl.1.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +
## Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
## Data: scaled.data
##
## AIC BIC logLik deviance df.resid
## 5961.3 6042.0 -2968.6 5937.3 6167
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1850 -0.5192 -0.3756 -0.2350 4.9215
##
## Random effects:
## Groups Name Variance Std.Dev.
## Genet (Intercept) 0.3425 0.5852
## Number of obs: 6179, groups: Genet, 490
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.486162 0.093294 -15.930 < 2e-16 ***
## Vol 0.449109 0.048158 9.326 < 2e-16 ***
## CT 0.016266 0.044503 0.365 0.71474
## PG 0.007611 0.044304 0.172 0.86361
## survey.monthjun 0.597560 0.073190 8.165 3.23e-16 ***
## Block2 0.070013 0.097842 0.716 0.47426
## Block3 0.240398 0.096738 2.485 0.01295 *
## Block4 0.311018 0.096593 3.220 0.00128 **
## Survey.Year2017 -0.946862 0.084800 -11.166 < 2e-16 ***
## BBreakDegDay 0.546381 0.043906 12.444 < 2e-16 ***
## EFNMean -0.048123 0.039981 -1.204 0.22873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.save.list <- c(model.save.list,"pres.mod.opt")
The interpretation of this model is different than the above model because the data have been rescaled. This model has converged and has stable parameters With approximate confidence intervals. Bootstrapping could be done to improve the accuracy of the confidence intervals but this will be very computationally expensive.
For now, we’ll use this re-optimized model that uses scaled data.
ggplot(data = NULL, aes(y = resid(mod.scl.1),
x = fitted(mod.scl.1.restart)))+
geom_point() + geom_smooth(method = "auto")
The residual plot from our model seems reasonable for a logistic model.
Here we’ll show the categorization error rates of the model. That is, what proportion of observations does the model correctly predict?
## Check how well it predicts things
y.hat <- (predict(mod.scl.1, type = "response") >= .5)
error.all <- 1 - sum(y.hat == full.y)/nrow(full.dat)
error.true <- 1 - sum(y.hat[full.y] ==
full.y[full.y])/length(full.y[full.y])
error.false <- 1 - sum(y.hat[!full.y] ==
full.y[!full.y])/length(full.y[!full.y])
## print the error rates of our model
errors <- data.frame(value = c(error.all,error.true,error.false),
type = c("overall error", "'true' error", "'false' error"))
ggplot(data = errors, aes(x = type, y = value,col = type,
fill = type)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_blank())+
labs(y = "error rate")
We see that our model has a very low over-all error rate. It correctly predicts the true state of our variable 79.24% of the time. However, we can see that the model performs poorly among observations whose true sate was TRUE
(i.e. the insect was found on the tree). This group is only predicted correctly 10.81% of the time. This problem occurs because there are far more observations where the insect was not present as shown here:
ggplot(full.dat, aes(x = Unique.ID,
y = jitter(as.numeric(get(y.var) > 0))))+
geom_point(aes(col = (get(y.var) > 0))) +
facet_wrap(~ survey.event)
This means that for every combination of covariates, the model has far fewer training data for TRUE
than it did for FALSE
. It is possible to tweak this prediction by shifting the classification decision boundary (currently at prob = .05) but this will come at the loss of predictive power for FALSE
group. It comes down to the question of “what prediction is most important?” It seems to me that we are much more interested in what prevents insects from being present and less about when they will be present, especially since there is a strong random component to this.
Let’s look at this further by shifting the classification boundary to 0.3:
## Check how well it predicts things
y.hat.s <- (predict(mod.scl.1, type = "response") >=.3)
error.all.s<- 1 - sum(y.hat.s == full.y)/nrow(full.dat)
error.true.s <- 1 - sum(y.hat.s[full.y] ==
full.y[full.y])/length(full.y[full.y])
error.false.s <- 1 - sum(y.hat.s[!full.y] ==
full.y[!full.y])/length(full.y[!full.y])
## print the error rates of our model
errors.s <- data.frame(value = c(error.all.s,error.true.s,error.false.s),
type = c("overall error", "'true' error", "'false' error"))
ggplot(data = errors.s, aes(x = type, y = value,col = type,
fill = type)) +
geom_bar(stat = "identity")+
theme(axis.text.x = element_blank())+
labs(y = "error rate")
Now we’ve reduced our error rate in the truly present group. Now we correctly predict presence of the insect 48.7% of the time but we’ve reduced our predictive power for the other group. We now classify accurately 85.16, 79.24% of the time for the absent class and all samples respectively.
Here we further visualize the error rates:
# layout(matrix())
plot((predict(mod.scl.1.restart, type = "response")) ~
jitter(as.numeric(full.y),amount = .45),
ylab = "predicted presence probability",
xlab = "observed presence");abline(h = c(0.5,0.3),
v = 0.5,
col = c("red","green","red"))
plot(jitter(as.numeric(y.hat),amount = .45) ~
jitter(as.numeric(full.y),amount = .45),
ylab = "predicted presence",
xlab = "observed presence",
main = "classification boundary at 0.5");abline(h = .5, v = 0.5, col = "red")
plot(jitter(as.numeric(predict(mod.scl.1.restart, type = "response") > .3),
amount = .25) ~
jitter(as.numeric(full.y),amount = .45),
ylab = "predicted presence",
xlab = "observed presence",
main = "classification boundary at 0.3");abline(h = .3, v = 0.5, col = "red")
caption <- "predicted presence vs. observed presence: Both plots show the
predicted presence paired with the observed presence. The x-axis
in both plots shows a spread out version of the observed presence
(1 = present, 2 = absent) of the insect. The y-axis of the top
plot displays the predicted probability that an observation belongs
to the 'present' class whereas the y-axis of the next 2 plots show
the predicted class, using a decision boundary cut-off of 0.5 and 0.3 respectively."
predicted presence vs. observed presence: the above plots show the predicted presence paired with the observed presence. The x-axis in both plots shows a spread out version of the observed presence (1 = present, 2 = absent) of the insect. The y-axis of the top plot displays the predicted probability that an observation belongs to the ‘present’ class whereas the y-axis of the next 2 plots show the predicted class, using a decision boundary cut-off of 0.5 and 0.3 respectively.
The plots above can give us a visual representation of the error rates mentioned previously. Points in the lower-left and upper-right quadrants are observations whose class the model correctly predicts while the upper-left and lower-right quadrants show observations whose class were predicted incorrectly by the model.
We can see that our 3rd plot makes more liberal classifications concerning presence. We can interpret this new cut-off with the following statement. “We will predict a tree as having if the probability of it being present is \(\geq0.3\) and we will predict that a tree does not have the insect if the probability of the insect being present is \(\leq0.7\).”
scaled.data <- scaled.dat
if(params$run.mods){
# time as random effects
rand.time.mod <- update(mod.scl.1,
formula = . ~ Vol + CT + PG + Block +
EFNMean + BBreakDegDay +
(1 | Genet) + (1 | Survey.Year/survey.month))
##optimize
rand.opt <- model.opt(rand.time.mod, scaled.data = scaled.data)
# time and block as random
rand.time.block.mod <- update(mod.scl.1,
formula = . ~ Vol + CT + PG +
EFNMean + BBreakDegDay +
(1 | Genet) + (1 | Survey.Year/survey.month) +
(1 | Block))
## optimize
rand.tb.opt <- model.opt(rand.time.block.mod, scaled.data = scaled.data)
}
rand.time.restart <- rand.opt[[2]]
rand.time.block.restart <- rand.tb.opt[[2]]
anova(rand.time.mod,
rand.time.block.mod,
rand.time.restart,
rand.time.block.restart,
mod.scl.1.restart)
## Data: scaled.data
## Models:
## rand.time.block.mod: factor(Harmandia > 0) ~ Vol + CT + PG + EFNMean + BBreakDegDay +
## rand.time.block.mod: (1 | Genet) + (1 | Survey.Year/survey.month) + (1 | Block)
## rand.time.block.restart: factor(Harmandia > 0) ~ Vol + CT + PG + EFNMean + BBreakDegDay +
## rand.time.block.restart: (1 | Genet) + (1 | Survey.Year/survey.month) + (1 | Block)
## rand.time.mod: factor(Harmandia > 0) ~ Vol + CT + PG + Block + EFNMean + BBreakDegDay +
## rand.time.mod: (1 | Genet) + (1 | Survey.Year/survey.month)
## rand.time.restart: factor(Harmandia > 0) ~ Vol + CT + PG + Block + EFNMean + BBreakDegDay +
## rand.time.restart: (1 | Genet) + (1 | Survey.Year/survey.month)
## mod.scl.1.restart: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +
## mod.scl.1.restart: Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
## Df AIC BIC logLik deviance Chisq Chi Df
## rand.time.block.mod 10 5981.6 6048.9 -2980.8 5961.6
## rand.time.block.restart 10 5981.6 6048.9 -2980.8 5961.6 0.0000 0
## rand.time.mod 12 5977.9 6058.6 -2976.9 5953.9 7.6694 2
## rand.time.restart 12 5977.9 6058.6 -2976.9 5953.9 0.0000 0
## mod.scl.1.restart 12 5961.3 6042.0 -2968.6 5937.3 16.6111 0
## Pr(>Chisq)
## rand.time.block.mod
## rand.time.block.restart < 2e-16 ***
## rand.time.mod 0.02161 *
## rand.time.restart < 2e-16 ***
## mod.scl.1.restart < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.save.list <- c(model.save.list, "rand.time.mod","rand.opt",
"rand.time.block.mod","rand.tb.opt")
After optimizing the models with random block and time effects, we can see that the model with only genet as a random effect fits the data best (lowest AIC and largest log-likelihood).
summary(mod.scl.1.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +
## Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
## Data: scaled.data
##
## AIC BIC logLik deviance df.resid
## 5961.3 6042.0 -2968.6 5937.3 6167
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1850 -0.5192 -0.3756 -0.2350 4.9215
##
## Random effects:
## Groups Name Variance Std.Dev.
## Genet (Intercept) 0.3425 0.5852
## Number of obs: 6179, groups: Genet, 490
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.486162 0.093294 -15.930 < 2e-16 ***
## Vol 0.449109 0.048158 9.326 < 2e-16 ***
## CT 0.016266 0.044503 0.365 0.71474
## PG 0.007611 0.044304 0.172 0.86361
## survey.monthjun 0.597560 0.073190 8.165 3.23e-16 ***
## Block2 0.070013 0.097842 0.716 0.47426
## Block3 0.240398 0.096738 2.485 0.01295 *
## Block4 0.311018 0.096593 3.220 0.00128 **
## Survey.Year2017 -0.946862 0.084800 -11.166 < 2e-16 ***
## BBreakDegDay 0.546381 0.043906 12.444 < 2e-16 ***
## EFNMean -0.048123 0.039981 -1.204 0.22873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(mod.scl.1.restart)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: factor(Harmandia > 0)
## Chisq Df Pr(>Chisq)
## Vol 86.9691 1 < 2.2e-16 ***
## CT 0.1336 1 0.714741
## PG 0.0295 1 0.863606
## survey.month 66.6597 1 3.227e-16 ***
## Block 13.3987 3 0.003849 **
## Survey.Year 124.6747 1 < 2.2e-16 ***
## BBreakDegDay 154.8651 1 < 2.2e-16 ***
## EFNMean 1.4488 1 0.228726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now we’ll model the abundance of Harmandia given that it is present on a tree.
First, I’m going to try poisson regression
Note that, because our response variable has no 0s and poisson does, we’ll subtract 1 from the response variable in the model.
# only rows where the insect was found
abun.dat <- full.dat %>% filter(get(y.var) > 0) %>% na.omit()
# modify the full formula for the continuous case
abun.formula <- update(full.formula, get(y.var) - 1 ~ .)
# rescale the numeric variables
scaled.data <- abun.dat.scl <- transform(abun.dat,
Vol = scale(Vol^(1/3)),
CT = scale(CT),
PG = scale(PG),
BBreakDegDay = scale(BBreakDegDay),
EFNMean = scale(EFNMean)
)
if(params$run.mods){
# build the abundance model
abun.mod <- glmer(formula = abun.formula, family = "poisson", data = abun.dat, offset = offset(Min.per.Tree))
# optimize the models
abun.mod.opt <- model.opt(mod = abun.mod, scaled.dat = scaled.data)
}
abun.mod.scl <- abun.mod.opt[[1]]
abun.mod.scl.restart <- abun.mod.opt[[2]]
model.save.list <- c(model.save.list,"abun.mod","abun.mod.opt")
summary(abun.mod.scl, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +
## BBreakDegDay + EFNMean + (1 | Genet)
## Data: scaled.data
## Offset: offset(Min.per.Tree)
##
## AIC BIC logLik deviance df.resid
## 8183.9 8246.3 -4079.9 8159.9 1329
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.354 -0.867 -0.294 1.109 281.720
##
## Random effects:
## Groups Name Variance Std.Dev.
## Genet (Intercept) 2.115 1.454
## Number of obs: 1341, groups: Genet, 397
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.12494 0.11294 -63.087 < 2e-16 ***
## Vol -1.71118 0.04858 -35.222 < 2e-16 ***
## CT -0.10675 0.03944 -2.707 0.006797 **
## PG -0.10402 0.04416 -2.356 0.018493 *
## survey.monthjun 0.54951 0.05023 10.939 < 2e-16 ***
## Block2 0.35539 0.08284 4.290 1.78e-05 ***
## Block3 0.40968 0.07996 5.124 3.00e-07 ***
## Block4 0.45346 0.08140 5.571 2.53e-08 ***
## Survey.Year2017 0.18016 0.07246 2.486 0.012903 *
## BBreakDegDay 0.10770 0.05334 2.019 0.043477 *
## EFNMean 0.14820 0.03894 3.806 0.000141 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## convergence code: 0
## Model failed to converge with max|grad| = 0.0844753 (tol = 0.001, component 1)
summary(abun.mod.scl.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula:
## get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +
## BBreakDegDay + EFNMean + (1 | Genet)
## Data: scaled.data
## Offset: offset(Min.per.Tree)
##
## AIC BIC logLik deviance df.resid
## 8183.9 8246.3 -4079.9 8159.9 1329
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.354 -0.866 -0.294 1.109 281.625
##
## Random effects:
## Groups Name Variance Std.Dev.
## Genet (Intercept) 2.117 1.455
## Number of obs: 1341, groups: Genet, 397
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.12466 0.11296 -63.071 < 2e-16 ***
## Vol -1.71209 0.04859 -35.237 < 2e-16 ***
## CT -0.10770 0.03944 -2.730 0.006324 **
## PG -0.10452 0.04417 -2.367 0.017956 *
## survey.monthjun 0.55044 0.05024 10.956 < 2e-16 ***
## Block2 0.35576 0.08283 4.295 1.75e-05 ***
## Block3 0.40775 0.07995 5.100 3.40e-07 ***
## Block4 0.45138 0.08139 5.546 2.93e-08 ***
## Survey.Year2017 0.17913 0.07246 2.472 0.013435 *
## BBreakDegDay 0.10694 0.05335 2.004 0.045030 *
## EFNMean 0.15040 0.03894 3.862 0.000112 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: get(y.var) - 1
## Chisq Df Pr(>Chisq)
## Vol 652.1050 1 < 2.2e-16 ***
## CT 0.6599 1 0.4165886
## PG 5.6382 1 0.0175737 *
## survey.month 102.1630 1 < 2.2e-16 ***
## Block 30.5272 3 1.069e-06 ***
## Survey.Year 6.1987 1 0.0127842 *
## BBreakDegDay 12.5193 1 0.0004028 ***
## EFNMean 20.3219 1 6.545e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod.scl)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: get(y.var) - 1
## Chisq Df Pr(>Chisq)
## Vol 1240.5701 1 < 2.2e-16 ***
## CT 7.3259 1 0.0067970 **
## PG 5.5488 1 0.0184934 *
## survey.month 119.6644 1 < 2.2e-16 ***
## Block 37.3030 3 3.97e-08 ***
## Survey.Year 6.1823 1 0.0129034 *
## BBreakDegDay 4.0768 1 0.0434769 *
## EFNMean 14.4834 1 0.0001414 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod.scl.restart)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: get(y.var) - 1
## Chisq Df Pr(>Chisq)
## Vol 1241.6691 1 < 2.2e-16 ***
## CT 7.4555 1 0.0063244 **
## PG 5.6004 1 0.0179564 *
## survey.month 120.0307 1 < 2.2e-16 ***
## Block 37.0197 3 4.557e-08 ***
## Survey.Year 6.1109 1 0.0134348 *
## BBreakDegDay 4.0175 1 0.0450297 *
## EFNMean 14.9146 1 0.0001125 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(abun.mod.scl, abun.mod.scl.restart)
## Data: scaled.data
## Models:
## abun.mod.scl: get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +
## abun.mod.scl: BBreakDegDay + EFNMean + (1 | Genet)
## abun.mod.scl.restart: get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +
## abun.mod.scl.restart: BBreakDegDay + EFNMean + (1 | Genet)
## Df AIC BIC logLik deviance Chisq Chi Df
## abun.mod.scl 12 8183.9 8246.3 -4079.9 8159.9
## abun.mod.scl.restart 12 8183.9 8246.3 -4079.9 8159.9 0.0076 0
## Pr(>Chisq)
## abun.mod.scl
## abun.mod.scl.restart < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# pearson residuals by fitted value
# plot(abun.mod.scl.restart)
# extract pearson residuals
resid.pois <- residuals(abun.mod.scl.restart, type = "deviance")
# look at a qqplot of the residuals
qqnorm(resid.pois, main = "QQplot - full");qqline(resid.pois, col = "red")
qqnorm(resid.pois,ylim = c(-5,50),
main = "QQplot - zoomed");qqline(resid.pois, col = "red")
plot(abun.dat.scl$Vol, resid.pois, xlab = "vol^(1/3)",ylab = "deviance residuals")
# plot(abun.dat.scl$fitted, resid.pois);
# abline(h = 0, col = "red")
abun.dat.scl$preds <- predict(abun.mod.scl.restart)
# plot(fitted ~ preds, data = abun.dat.scl)
Next we’ll see how our model performs prediction-wise
(MSE<- mean(residuals(abun.mod.scl.restart)^2))
## [1] 3.503099
The model has an MSE (Which is the variance of our estimator) of 3.5. Another way of thinking about this is that \(\sqrt{MSE}\) is essentially the standard deviation of the estimator which, in this case is 1.87
Next we’ll look at the distribution of our data and of the fitted values from the model. We assume that the count data comes from a Poisson model with a center parameter equal to the mean # of individuals observed.
lamb = round(mean(abun.dat.scl[,y.var] - 1))
hist(abun.dat.scl[,y.var] - 1, probability = TRUE, breaks = 20,
col = rgb(1,0,0,.4),
ylim = c(0,max(dpois(0:20,lamb)+.1)),
main = "Observed Data",
xlab = paste(y.var,"Abundance - 1"))
lines(y = dpois(0:20,lambda = lamb),x = 0:20, col = "red",lwd = 1.5)
legend(x = "topright",legend = paste0("dpois(lambda=",lamb,")"),col = "red",lwd = 1.5)
# curve(from = 0, to = 20,dpois(x, lambda = lamb),add = TRUE, col = "red")
hist(fitted(abun.mod.scl.restart),probability = TRUE,
col = rgb(0,0,1,.4), add = FALSE,
ylim = c(0,max(dpois(0:20,lamb)+.1)),
main = "Fitted Values",
xlab = paste("Fitted",y.var,"Abundance"))
lines(y = dpois(0:20,lambda = lamb),x = 0:20, col = "red",lwd = 1.5)
legend(x = "topright",legend = paste0("dpois(lambda=",lamb,")"),col = "red",lwd = 1.5)
# curve(from = 0, to = 20,dpois(x, lambda = lamb),add = TRUE, col = "red")
Both our observed data and the predicted data seem to fit same poisson distribution quite well. It should be noted that, because the data have no zeros, the model is predicting zeros. To account for this, shifting the fitted values up 1 seems to be effective.
abun.dat.scl$fitted <- fitted(abun.mod.scl.restart) + 1
ggplot(data = abun.dat.scl, aes(x = Min.per.Tree, y = get(y.var)))+
geom_point()+
geom_point(aes(y = fitted, x = Min.per.Tree + .2), col = "red")
fit.mod <- lm(fitted ~ get(y.var), data = abun.dat.scl)
# summary(fit.mod)
plot(fitted ~ get(y.var), data = abun.dat.scl)
abline(fit.mod, col = "red");abline(a = 0, b = 1, col = "blue", lty = "dashed")
Based on diagnostic plots from the Poisson model, the negative binomial regression may fit slightly better.
# save the model objects
save(list = c(unique(model.save.list)),
file = "data/initial-modeling-objects.RData")