print(pi)
## [1] 3.141593
# create a new environment in which to load data
  ## this prevents cluttering the workspace
data.env <- new.env()
# load data (from params) into the new environment
data(list = params$data,envir = data.env)

if(!params$rerun.errors){load("data/error-rate_binomial-models.RData")}

Test and Compare Models for Presence

# define full data to use
full.dat <- data.env$full.data %>% 
  dplyr::select(data.env$common.insects,Genet, 
         Unique.ID, SerialNo, Block,
         survey.event, Survey.Year,
         Vol, PG, CT, 
         BBreakDegDay, EFNMean) %>% 
  na.omit()
full.y <- factor(full.dat$Petiole.Gall > 0)

## Remove genets with no replication within years
# which genets have less than 8 replicates over the 4 survey events?
genet.n <- full.dat %>% group_by(Genet) %>% tally() %>% filter(!n < 8)
full.dat <- full.dat %>% filter(Genet %in% genet.n$Genet)
full.dat <- full.dat %>% mutate(survey.month = factor(gsub(survey.event, 
                                        pattern = "(.*)(1.)",
                                        replacement = "\\1")))
if(params$rerun.errors){
  error.rate.lm <- NULL
  error.rate.lm.gen <- NULL
  error.rate.glm <- NULL
  error.rate.glm.gen <- NULL
  error.rate.lme <- NULL
  error.rate.qda <- NULL
  error.rate.pred.qda <- NULL
  
  for(i in 1:params$error.iters){
    # Subset the data
    sub.samp <- full.dat %>% 
      filter(Genet %in% sample(data.env$full.data$Genet, size = 350))
    n <- nrow(sub.samp)
    y <- as.factor(sub.samp$Petiole.Gall > 0)
    
    # lm  
    pet.lm <- lm(data = sub.samp, 
                 formula = y ~ PG + CT +
                           Vol + Survey.Year
                 )
    ### Predictions
    yhat.lm <- predict(pet.lm)
    error.rate <- 1 - sum((yhat.lm > .05) == y)/n
    error.rate.lm <- c(error.rate.lm,error.rate)
    ##---
    
    # lm with genet
    pet.lm.gen <- lm(data = sub.samp,
                 formula = y ~ PG + CT +
                           Vol + Survey.Year + Genet
                 )
    ### Predictions
    yhat.lm.gen <- predict(pet.lm.gen)
    error.rate <- 1 - sum((yhat.lm.gen > .05) == y)/n
    error.rate.lm.gen <- c(error.rate.lm.gen,error.rate)
    ##---
    
    # glm  
    pet.glm <- glm(data = sub.samp, family = "binomial",
                 formula = y ~ PG + CT +
                           Vol + Survey.Year
                 )
    ### Predictions
    yhat.glm <- predict(pet.glm)
    error.rate <- 1 - sum((yhat.glm > .05) == y)/n
    error.rate.glm <- c(error.rate.glm,error.rate)
    ##---
    
    # glm with genet
    pet.glm.gen <- glm(data = sub.samp, family = "binomial",
                 formula = y ~ PG + CT +
                           Vol + Survey.Year
                 )
    ### Predictions
    yhat.glm.gen <- predict(pet.glm.gen)
    error.rate <- 1 - sum((yhat.glm.gen > .05) == y)/n
    error.rate.glm.gen <- c(error.rate.glm.gen,error.rate)
    ##---
    
    
    # lme with genet as random effect
    pet.lme <- glmer(data = sub.samp, family = binomial,
                 formula = y ~ PG + CT +
                           Vol + Survey.Year + (1|Genet)
                 )
    ### Predictions
    yhat.lme <- predict(pet.lme)
    error.rate <- 1 - sum((yhat.lme > .05) == y)/n
    error.rate.lme <- c(error.rate.lme,error.rate)
    ##---
    
    # lme with genet as random effect
    pet.qda <- qda(data = sub.samp, (Petiole.Gall > 0) ~ PG + CT +
                           Vol + Survey.Year
                 )
    
    ### Predictions
    yhat.qda <- predict(pet.qda)
    error.rate <- 1 - sum(yhat.qda$class == y)/n
    error.rate.qda <- c(error.rate.qda,error.rate)
    ##---
    
    
    # Now predict to the FULL data with QDA
     pred.y.qda <- predict(pet.qda,full.dat)
     error.rate <- 1 - sum(pred.y.qda$class == full.y)/nrow(full.dat)
     error.rate.pred.qda <- c(error.rate.pred.qda,error.rate)
       
    
    print(i)
  }
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
if(params$rerun.errors){
  error.rate.lme.yr <- NULL
  for(i in 1:8){
    # Subset the data
    sub.samp <- full.dat %>% 
      filter(Genet %in% sample(data.env$full.data$Genet, size = 350))
    n <- nrow(sub.samp)
    y <- as.factor(sub.samp$Petiole.Gall > 0)
    
    # lme with genet as random effect
    pet.lme.yr <- glmer(data = sub.samp, family = binomial,
                 formula = y ~ PG + CT +
                           Vol + (1|Genet) + (1|Survey.Year)
                 )
    ### Predictions
    yhat.lme.yr <- predict(pet.lme.yr)
    error.rate <- 1 - sum((yhat.lme.yr > .05) == y)/n
    error.rate.lme.yr <- c(error.rate.lme.yr,error.rate)
    ##---
    print(i)
  }  
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
if(params$rerun.errors){
  error.rate.big <- NULL
  # for(i in 1:4){# big glm
    system.time(advanced.glm <- glmer(data = sub.samp, 
                                      family = binomial,
                            # control = glmerControl(optimizer = "bobyqa"),
                            formula = factor(Petiole.Gall > 0) ~ 
                            CT + PG + Vol +
                            BBreakDegDay + survey.month +
                            Survey.Year + (1|Genet)
                          ))
    ### Predictions
    yhat.big <- predict(advanced.glm, type = "response")
    error.rate <- 1 - (sum((yhat.big > .05) == 
                            y))/n
    error.rate.big <- c(error.rate.big,error.rate)
    # error.absent <- 1 - sum((yhat.big[which(full.y == FALSE)] > 0) == full.y[which(full.y == FALSE)])/length(full.y[which(full.y == FALSE)])
    # error.present <- 1 - sum((yhat.big[which(full.y == TRUE)] > 0) == full.y[which(full.y == TRUE)])/length(full.y[which(full.y == TRUE)])
    ##---
    # print(i)}
}
# summary(advanced.glm)
# boxplot(error.rate.big,error.absent,error.present)
mod.error.data <- data.frame(
  error = c(error.rate.lm, 
            error.rate.lm.gen,
            error.rate.glm,
            error.rate.glm.gen, 
            error.rate.qda,
            error.rate.pred.qda,
            error.rate.lme),
  method = rep(c("lm","lm.gen","glm","glm.gen","qda","qda.pred","glme"),each = params$error.iters)
      )

mod.error.data <- rbind(mod.error.data,data.frame(error = error.rate.lme.yr,
                                      method = "glme.yr.gen"))
mod.error.data <- rbind(mod.error.data,data.frame(error = error.rate.big,
                                      method = "glm.big"))


ggplot(data = mod.error.data, aes(x = method, y = error, fill = method)) + 
  geom_boxplot() + theme_dark()

if(params$rerun.errors){
save(list = (c("error.rate.lm", 
            "error.rate.lm.gen",
            "error.rate.glm",
            "error.rate.glm.gen", 
            "error.rate.qda",
            "error.rate.pred.qda",
            "error.rate.lme",
            "error.rate.lme.yr",
            "error.rate.big",
            # table of above
            "mod.error.data",
            # the models
            "pet.lm","pet.glm","pet.glm.gen","pet.lm.gen",
            "pet.lme","pet.lme.yr","pet.qda","advanced.glm"
            )),
     file = "data/error-rate_binomial-models.RData")
}

We can see that GLMs have a much lower over-all error rate than the other types.

I’ll test out the best ways to make GLMMs work:

Binomial GLMER tweaking

# reload the data
load("data/r-objects.RData")

# define the variables to use in the models
variables <- c(common.insects,"Vol","CT","PG","survey.month",
               "Genet", "Block","Survey.Year",
               "BBreakDegDay","EFNMean")
rand.vars <- "Genet"
y.var <- "Harmandia"

# define full data to use
full.dat <- full.data %>%
  dplyr::select(common.insects,Genet,
         Unique.ID, SerialNo, Block,
         survey.event, Survey.Year,
         Vol, PG, CT,
         BBreakDegDay, EFNMean, Min.per.Tree) %>%
  na.omit()


# define response variable
full.y <- full.dat[,y.var] > 0


## change survey event to survey month (jun/aug)
full.dat <- full.dat %>%
  mutate(survey.month = factor(gsub(survey.event,
                                    pattern = "(.*)(1.)",
                                    replacement = "\\1"))
         )
## define the fixed effects
fixed.vars <- variables[! variables %in% c(common.insects, rand.vars)]

full.formula <- formula(paste("factor(",y.var, "> 0) ~", # response
                      paste(fixed.vars, collapse = "+"), # fixefs
                      "+ (1 | Genet)")) #ranefs
if(!params$run.mods){load("data/initial-modeling-objects.RData")}
  
model.save.list <- NULL
#Full Model
## Fit the full model using glmer's defaults
if(params$run.mods){
  mod.1 <- glmer(formula = full.formula, data = full.dat, family = "binomial")
}
## print the model summary
summary(mod.1, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +  
##     Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
##    Data: full.dat
## 
##      AIC      BIC   logLik deviance df.resid 
##   5989.3   6070.0  -2982.6   5965.3     6167 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0869 -0.5213 -0.3802 -0.2430  4.3397 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Genet  (Intercept) 0.3361   0.5797  
## Number of obs: 6179, groups:  Genet, 490
## 
## Fixed effects:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -5.591820   0.360329 -15.519  < 2e-16 ***
## Vol              0.013596   0.001704   7.977  1.5e-15 ***
## CT              -0.007433   0.013572  -0.548 0.583906    
## PG              -0.005372   0.014806  -0.363 0.716755    
## survey.monthjun  0.621005   0.072787   8.532  < 2e-16 ***
## Block2           0.081668   0.097574   0.837 0.402602    
## Block3           0.250067   0.096560   2.590 0.009605 ** 
## Block4           0.319777   0.096325   3.320 0.000901 ***
## Survey.Year2017 -0.873173   0.083898 -10.408  < 2e-16 ***
## BBreakDegDay     0.016120   0.001339  12.038  < 2e-16 ***
## EFNMean         -0.132759   0.093140  -1.425 0.154052    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## convergence code: 0
## Model failed to converge with max|grad| = 0.126957 (tol = 0.001, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
## use all optimizers to fit the model and compare them
# optim.1 <- allFit(mod.1)
##
# mod.select <- drop1(mod.1)

model.save.list <- c(model.save.list, "mod.1")

Model Optimization

Let’s re-optimize the model to fix any convergence issues. The steps we will take were suggested by Ben Bolker and include, 1. reduce the stopping tolerance, 2. rescale the variables, 3. Recompute gradient and Hessian, 4. Restart the fit from starting points equal to the estimated parameters from the first fit.

These steps ensure that model truly is maximizing the likelihood of the parameter values to the best of the abilities of all available algorithms and methods.

# Trouble shoot convergence issues: `?convergence()`
model.opt <- function(mod, scaled.data, strict = FALSE){
  stopifnot(isGLMM(mod) | isLMM(mod))
  # SC.DAT <- deparse(substitute(scaled.data))
  ## 1. decrease stopping tolerances
  strict_tol <- lmerControl(optCtrl=list(xtol_abs=1e-8, ftol_abs=1e-8))
  if (all(mod@optinfo$optimizer == "nloptwrap")) {
    mod <- update(mod, control=strict_tol)
  }
  
  ## fit the model to scaled data
  scale.mod <- update(mod, data = scaled.data)
  
  ## 3. recompute gradient and Hessian with Richardson extrapolation
  devfun <- update(scale.mod, devFunOnly = TRUE)
  if (isLMM(scale.mod)) {
    pars <- getME(scale.mod,"theta")
  } else {
    ## GLMM: requires both random and fixed parameters
    pars <- getME(scale.mod, c("theta","fixef"))
  }
  if (require("numDeriv")) {
    hess <- hessian(devfun, unlist(pars))
    grad <- grad(devfun, unlist(pars))
    scgrad <- solve(chol(hess), grad)
  }
  ## compare with internal calculations:
  # scale.mod@optinfo$derivs # looked nearly identical
  
  ## 4. restart the fit from the original value (or
  ## a slightly perturbed value):
  scale.mod.restart <- update(scale.mod, start = pars)
  if(strict){
    restart.strict <- update(scale.mod, start = pars, control = strict_tol)
  } else{restart.strict <- NULL}
  # set.seed(101)
  # pars_x <- runif(length(pars),pars/1.01,pars*1.01)
  # scale.mod.restart2 <- update(scale.mod, start=pars_x,
  #                        control = strict_tol)
  return(list(scale.mod, scale.mod.restart, pars,mod.tol = mod, restart.strict))
}

model.save.list <- c(model.save.list,"model.opt")
## rescale Data
## in order to fix the eigenvalue warning, I'm going to rescale the data
scaled.dat <- transform(full.dat,
                        Vol = scale(Vol^(1/3)),
                        CT = scale(CT),
                        PG = scale(PG),
                        BBreakDegDay = scale(BBreakDegDay),
                        EFNMean = scale(EFNMean)
                        )

if(params$run.mods){
  pres.mod.opt <- model.opt(mod = mod.1,scaled.data = scaled.dat)
}
mod.scl.1 <- pres.mod.opt[[1]]
mod.scl.1.restart <- pres.mod.opt[[2]]

summary(mod.scl.1.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +  
##     Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
##    Data: scaled.data
## 
##      AIC      BIC   logLik deviance df.resid 
##   5961.3   6042.0  -2968.6   5937.3     6167 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1850 -0.5192 -0.3756 -0.2350  4.9215 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Genet  (Intercept) 0.3425   0.5852  
## Number of obs: 6179, groups:  Genet, 490
## 
## Fixed effects:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.486162   0.093294 -15.930  < 2e-16 ***
## Vol              0.449109   0.048158   9.326  < 2e-16 ***
## CT               0.016266   0.044503   0.365  0.71474    
## PG               0.007611   0.044304   0.172  0.86361    
## survey.monthjun  0.597560   0.073190   8.165 3.23e-16 ***
## Block2           0.070013   0.097842   0.716  0.47426    
## Block3           0.240398   0.096738   2.485  0.01295 *  
## Block4           0.311018   0.096593   3.220  0.00128 ** 
## Survey.Year2017 -0.946862   0.084800 -11.166  < 2e-16 ***
## BBreakDegDay     0.546381   0.043906  12.444  < 2e-16 ***
## EFNMean         -0.048123   0.039981  -1.204  0.22873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.save.list <- c(model.save.list,"pres.mod.opt")

The interpretation of this model is different than the above model because the data have been rescaled. This model has converged and has stable parameters With approximate confidence intervals. Bootstrapping could be done to improve the accuracy of the confidence intervals but this will be very computationally expensive.

For now, we’ll use this re-optimized model that uses scaled data.

Residual Plot

ggplot(data = NULL, aes(y = resid(mod.scl.1), 
                             x = fitted(mod.scl.1.restart)))+
  geom_point() + geom_smooth(method = "auto")

The residual plot from our model seems reasonable for a logistic model.

Prediction accuracy

Here we’ll show the categorization error rates of the model. That is, what proportion of observations does the model correctly predict?

## Check how well it predicts things
y.hat <- (predict(mod.scl.1, type = "response") >= .5)

error.all <- 1 - sum(y.hat == full.y)/nrow(full.dat)
error.true <- 1 - sum(y.hat[full.y] ==
                        full.y[full.y])/length(full.y[full.y])
error.false <- 1 - sum(y.hat[!full.y] ==
                         full.y[!full.y])/length(full.y[!full.y])
## print the error rates of our model
errors <- data.frame(value = c(error.all,error.true,error.false),
           type = c("overall error", "'true' error", "'false' error"))

ggplot(data = errors, aes(x = type, y = value,col = type, 
                          fill = type)) +
  geom_bar(stat = "identity")+
  theme(axis.text.x = element_blank())+
  labs(y = "error rate")

We see that our model has a very low over-all error rate. It correctly predicts the true state of our variable 79.24% of the time. However, we can see that the model performs poorly among observations whose true sate was TRUE (i.e. the insect was found on the tree). This group is only predicted correctly 10.81% of the time. This problem occurs because there are far more observations where the insect was not present as shown here:

ggplot(full.dat, aes(x = Unique.ID,
                     y = jitter(as.numeric(get(y.var) > 0))))+
  geom_point(aes(col = (get(y.var) > 0))) +
  facet_wrap(~ survey.event)

This means that for every combination of covariates, the model has far fewer training data for TRUE than it did for FALSE. It is possible to tweak this prediction by shifting the classification decision boundary (currently at prob = .05) but this will come at the loss of predictive power for FALSE group. It comes down to the question of “what prediction is most important?” It seems to me that we are much more interested in what prevents insects from being present and less about when they will be present, especially since there is a strong random component to this.

Let’s look at this further by shifting the classification boundary to 0.3:

## Check how well it predicts things
y.hat.s <- (predict(mod.scl.1, type = "response") >=.3)

error.all.s<- 1 - sum(y.hat.s == full.y)/nrow(full.dat)
error.true.s <- 1 - sum(y.hat.s[full.y] ==
                        full.y[full.y])/length(full.y[full.y])
error.false.s <- 1 - sum(y.hat.s[!full.y] ==
                         full.y[!full.y])/length(full.y[!full.y])
## print the error rates of our model
errors.s <- data.frame(value = c(error.all.s,error.true.s,error.false.s),
           type = c("overall error", "'true' error", "'false' error"))

ggplot(data = errors.s, aes(x = type, y = value,col = type, 
                          fill = type)) +
  geom_bar(stat = "identity")+
  theme(axis.text.x = element_blank())+
  labs(y = "error rate")

Now we’ve reduced our error rate in the truly present group. Now we correctly predict presence of the insect 48.7% of the time but we’ve reduced our predictive power for the other group. We now classify accurately 85.16, 79.24% of the time for the absent class and all samples respectively.

Here we further visualize the error rates:

# layout(matrix())

plot((predict(mod.scl.1.restart, type = "response")) ~
       jitter(as.numeric(full.y),amount = .45),
     ylab = "predicted presence probability",
     xlab = "observed presence");abline(h = c(0.5,0.3), 
                                        v = 0.5, 
                                        col = c("red","green","red"))

plot(jitter(as.numeric(y.hat),amount = .45) ~
       jitter(as.numeric(full.y),amount = .45),
     ylab = "predicted presence",
     xlab = "observed presence",
     main = "classification boundary at 0.5");abline(h = .5, v = 0.5, col = "red")

plot(jitter(as.numeric(predict(mod.scl.1.restart, type = "response") > .3),
            amount = .25) ~
       jitter(as.numeric(full.y),amount = .45),
     ylab = "predicted presence",
     xlab = "observed presence",
     main = "classification boundary at 0.3");abline(h = .3, v = 0.5, col = "red")

caption <- "predicted presence vs. observed presence: Both plots show the
             predicted presence paired with the observed presence. The x-axis 
             in both plots shows a spread out version of the observed presence
             (1 = present, 2 = absent) of the insect. The y-axis of the top
             plot displays the predicted probability that an observation belongs
             to the 'present' class whereas the y-axis of the next 2 plots show
             the predicted class, using a decision boundary cut-off of 0.5 and 0.3              respectively."

predicted presence vs. observed presence: the above plots show the predicted presence paired with the observed presence. The x-axis in both plots shows a spread out version of the observed presence (1 = present, 2 = absent) of the insect. The y-axis of the top plot displays the predicted probability that an observation belongs to the ‘present’ class whereas the y-axis of the next 2 plots show the predicted class, using a decision boundary cut-off of 0.5 and 0.3 respectively.

The plots above can give us a visual representation of the error rates mentioned previously. Points in the lower-left and upper-right quadrants are observations whose class the model correctly predicts while the upper-left and lower-right quadrants show observations whose class were predicted incorrectly by the model.

We can see that our 3rd plot makes more liberal classifications concerning presence. We can interpret this new cut-off with the following statement. “We will predict a tree as having if the probability of it being present is \(\geq0.3\) and we will predict that a tree does not have the insect if the probability of the insect being present is \(\leq0.7\).”

More random effects

scaled.data <- scaled.dat

if(params$run.mods){
  # time as random effects
  rand.time.mod <- update(mod.scl.1, 
                           formula = . ~ Vol + CT + PG + Block +
                            EFNMean + BBreakDegDay + 
                             (1 | Genet) + (1 | Survey.Year/survey.month))
  ##optimize
  rand.opt <- model.opt(rand.time.mod, scaled.data = scaled.data)
  
  
  # time and block as random
  rand.time.block.mod <- update(mod.scl.1, 
                           formula = . ~ Vol + CT + PG +
                             EFNMean + BBreakDegDay + 
                             (1 | Genet) + (1 | Survey.Year/survey.month) + 
                             (1 | Block))
  ## optimize
  rand.tb.opt <- model.opt(rand.time.block.mod, scaled.data = scaled.data)
}


rand.time.restart <- rand.opt[[2]]
rand.time.block.restart <- rand.tb.opt[[2]]

anova(rand.time.mod,
      rand.time.block.mod,
      rand.time.restart,
      rand.time.block.restart,
      mod.scl.1.restart)
## Data: scaled.data
## Models:
## rand.time.block.mod: factor(Harmandia > 0) ~ Vol + CT + PG + EFNMean + BBreakDegDay + 
## rand.time.block.mod:     (1 | Genet) + (1 | Survey.Year/survey.month) + (1 | Block)
## rand.time.block.restart: factor(Harmandia > 0) ~ Vol + CT + PG + EFNMean + BBreakDegDay + 
## rand.time.block.restart:     (1 | Genet) + (1 | Survey.Year/survey.month) + (1 | Block)
## rand.time.mod: factor(Harmandia > 0) ~ Vol + CT + PG + Block + EFNMean + BBreakDegDay + 
## rand.time.mod:     (1 | Genet) + (1 | Survey.Year/survey.month)
## rand.time.restart: factor(Harmandia > 0) ~ Vol + CT + PG + Block + EFNMean + BBreakDegDay + 
## rand.time.restart:     (1 | Genet) + (1 | Survey.Year/survey.month)
## mod.scl.1.restart: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block + 
## mod.scl.1.restart:     Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
##                         Df    AIC    BIC  logLik deviance   Chisq Chi Df
## rand.time.block.mod     10 5981.6 6048.9 -2980.8   5961.6               
## rand.time.block.restart 10 5981.6 6048.9 -2980.8   5961.6  0.0000      0
## rand.time.mod           12 5977.9 6058.6 -2976.9   5953.9  7.6694      2
## rand.time.restart       12 5977.9 6058.6 -2976.9   5953.9  0.0000      0
## mod.scl.1.restart       12 5961.3 6042.0 -2968.6   5937.3 16.6111      0
##                         Pr(>Chisq)    
## rand.time.block.mod                   
## rand.time.block.restart    < 2e-16 ***
## rand.time.mod              0.02161 *  
## rand.time.restart          < 2e-16 ***
## mod.scl.1.restart          < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.save.list <- c(model.save.list, "rand.time.mod","rand.opt",
                     "rand.time.block.mod","rand.tb.opt")

After optimizing the models with random block and time effects, we can see that the model with only genet as a random effect fits the data best (lowest AIC and largest log-likelihood).

Important model components of best model

summary(mod.scl.1.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: factor(Harmandia > 0) ~ Vol + CT + PG + survey.month + Block +  
##     Survey.Year + BBreakDegDay + EFNMean + (1 | Genet)
##    Data: scaled.data
## 
##      AIC      BIC   logLik deviance df.resid 
##   5961.3   6042.0  -2968.6   5937.3     6167 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1850 -0.5192 -0.3756 -0.2350  4.9215 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Genet  (Intercept) 0.3425   0.5852  
## Number of obs: 6179, groups:  Genet, 490
## 
## Fixed effects:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.486162   0.093294 -15.930  < 2e-16 ***
## Vol              0.449109   0.048158   9.326  < 2e-16 ***
## CT               0.016266   0.044503   0.365  0.71474    
## PG               0.007611   0.044304   0.172  0.86361    
## survey.monthjun  0.597560   0.073190   8.165 3.23e-16 ***
## Block2           0.070013   0.097842   0.716  0.47426    
## Block3           0.240398   0.096738   2.485  0.01295 *  
## Block4           0.311018   0.096593   3.220  0.00128 ** 
## Survey.Year2017 -0.946862   0.084800 -11.166  < 2e-16 ***
## BBreakDegDay     0.546381   0.043906  12.444  < 2e-16 ***
## EFNMean         -0.048123   0.039981  -1.204  0.22873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(mod.scl.1.restart)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: factor(Harmandia > 0)
##                 Chisq Df Pr(>Chisq)    
## Vol           86.9691  1  < 2.2e-16 ***
## CT             0.1336  1   0.714741    
## PG             0.0295  1   0.863606    
## survey.month  66.6597  1  3.227e-16 ***
## Block         13.3987  3   0.003849 ** 
## Survey.Year  124.6747  1  < 2.2e-16 ***
## BBreakDegDay 154.8651  1  < 2.2e-16 ***
## EFNMean        1.4488  1   0.228726    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Insect abundance models

Now we’ll model the abundance of Harmandia given that it is present on a tree.

Poisson Regression

First, I’m going to try poisson regression

Note that, because our response variable has no 0s and poisson does, we’ll subtract 1 from the response variable in the model.

# only rows where the insect was found
abun.dat <- full.dat %>% filter(get(y.var) > 0) %>% na.omit()
# modify the full formula for the continuous case
abun.formula <- update(full.formula, get(y.var) - 1 ~ .)

# rescale the numeric variables
scaled.data <- abun.dat.scl <- transform(abun.dat,
                        Vol = scale(Vol^(1/3)),
                        CT = scale(CT),
                        PG = scale(PG),
                        BBreakDegDay = scale(BBreakDegDay),
                        EFNMean = scale(EFNMean)   
                        )

if(params$run.mods){
  # build the abundance model
  abun.mod <- glmer(formula = abun.formula, family = "poisson", data = abun.dat, offset = offset(Min.per.Tree))
  # optimize the models
  abun.mod.opt <- model.opt(mod = abun.mod, scaled.dat = scaled.data)
}

abun.mod.scl <- abun.mod.opt[[1]]
abun.mod.scl.restart <- abun.mod.opt[[2]]

model.save.list <- c(model.save.list,"abun.mod","abun.mod.opt")
summary(abun.mod.scl, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +  
##     BBreakDegDay + EFNMean + (1 | Genet)
##    Data: scaled.data
##  Offset: offset(Min.per.Tree)
## 
##      AIC      BIC   logLik deviance df.resid 
##   8183.9   8246.3  -4079.9   8159.9     1329 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
##  -4.354  -0.867  -0.294   1.109 281.720 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Genet  (Intercept) 2.115    1.454   
## Number of obs: 1341, groups:  Genet, 397
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -7.12494    0.11294 -63.087  < 2e-16 ***
## Vol             -1.71118    0.04858 -35.222  < 2e-16 ***
## CT              -0.10675    0.03944  -2.707 0.006797 ** 
## PG              -0.10402    0.04416  -2.356 0.018493 *  
## survey.monthjun  0.54951    0.05023  10.939  < 2e-16 ***
## Block2           0.35539    0.08284   4.290 1.78e-05 ***
## Block3           0.40968    0.07996   5.124 3.00e-07 ***
## Block4           0.45346    0.08140   5.571 2.53e-08 ***
## Survey.Year2017  0.18016    0.07246   2.486 0.012903 *  
## BBreakDegDay     0.10770    0.05334   2.019 0.043477 *  
## EFNMean          0.14820    0.03894   3.806 0.000141 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## convergence code: 0
## Model failed to converge with max|grad| = 0.0844753 (tol = 0.001, component 1)
summary(abun.mod.scl.restart, corr = FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year +  
##     BBreakDegDay + EFNMean + (1 | Genet)
##    Data: scaled.data
##  Offset: offset(Min.per.Tree)
## 
##      AIC      BIC   logLik deviance df.resid 
##   8183.9   8246.3  -4079.9   8159.9     1329 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
##  -4.354  -0.866  -0.294   1.109 281.625 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Genet  (Intercept) 2.117    1.455   
## Number of obs: 1341, groups:  Genet, 397
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -7.12466    0.11296 -63.071  < 2e-16 ***
## Vol             -1.71209    0.04859 -35.237  < 2e-16 ***
## CT              -0.10770    0.03944  -2.730 0.006324 ** 
## PG              -0.10452    0.04417  -2.367 0.017956 *  
## survey.monthjun  0.55044    0.05024  10.956  < 2e-16 ***
## Block2           0.35576    0.08283   4.295 1.75e-05 ***
## Block3           0.40775    0.07995   5.100 3.40e-07 ***
## Block4           0.45138    0.08139   5.546 2.93e-08 ***
## Survey.Year2017  0.17913    0.07246   2.472 0.013435 *  
## BBreakDegDay     0.10694    0.05335   2.004 0.045030 *  
## EFNMean          0.15040    0.03894   3.862 0.000112 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: get(y.var) - 1
##                 Chisq Df Pr(>Chisq)    
## Vol          652.1050  1  < 2.2e-16 ***
## CT             0.6599  1  0.4165886    
## PG             5.6382  1  0.0175737 *  
## survey.month 102.1630  1  < 2.2e-16 ***
## Block         30.5272  3  1.069e-06 ***
## Survey.Year    6.1987  1  0.0127842 *  
## BBreakDegDay  12.5193  1  0.0004028 ***
## EFNMean       20.3219  1  6.545e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod.scl)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: get(y.var) - 1
##                  Chisq Df Pr(>Chisq)    
## Vol          1240.5701  1  < 2.2e-16 ***
## CT              7.3259  1  0.0067970 ** 
## PG              5.5488  1  0.0184934 *  
## survey.month  119.6644  1  < 2.2e-16 ***
## Block          37.3030  3   3.97e-08 ***
## Survey.Year     6.1823  1  0.0129034 *  
## BBreakDegDay    4.0768  1  0.0434769 *  
## EFNMean        14.4834  1  0.0001414 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(abun.mod.scl.restart)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: get(y.var) - 1
##                  Chisq Df Pr(>Chisq)    
## Vol          1241.6691  1  < 2.2e-16 ***
## CT              7.4555  1  0.0063244 ** 
## PG              5.6004  1  0.0179564 *  
## survey.month  120.0307  1  < 2.2e-16 ***
## Block          37.0197  3  4.557e-08 ***
## Survey.Year     6.1109  1  0.0134348 *  
## BBreakDegDay    4.0175  1  0.0450297 *  
## EFNMean        14.9146  1  0.0001125 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(abun.mod.scl, abun.mod.scl.restart)
## Data: scaled.data
## Models:
## abun.mod.scl: get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year + 
## abun.mod.scl:     BBreakDegDay + EFNMean + (1 | Genet)
## abun.mod.scl.restart: get(y.var) - 1 ~ Vol + CT + PG + survey.month + Block + Survey.Year + 
## abun.mod.scl.restart:     BBreakDegDay + EFNMean + (1 | Genet)
##                      Df    AIC    BIC  logLik deviance  Chisq Chi Df
## abun.mod.scl         12 8183.9 8246.3 -4079.9   8159.9              
## abun.mod.scl.restart 12 8183.9 8246.3 -4079.9   8159.9 0.0076      0
##                      Pr(>Chisq)    
## abun.mod.scl                       
## abun.mod.scl.restart  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model Diagnostics

# pearson residuals by fitted value
# plot(abun.mod.scl.restart)

# extract pearson residuals
resid.pois <- residuals(abun.mod.scl.restart, type = "deviance")
# look at a qqplot of the residuals

qqnorm(resid.pois, main = "QQplot - full");qqline(resid.pois, col = "red")

qqnorm(resid.pois,ylim = c(-5,50),
       main = "QQplot - zoomed");qqline(resid.pois, col = "red")

plot(abun.dat.scl$Vol, resid.pois, xlab = "vol^(1/3)",ylab = "deviance residuals")

# plot(abun.dat.scl$fitted, resid.pois);
# abline(h = 0, col = "red")

abun.dat.scl$preds <- predict(abun.mod.scl.restart)

# plot(fitted ~ preds, data = abun.dat.scl)

Predictions

Next we’ll see how our model performs prediction-wise

(MSE<- mean(residuals(abun.mod.scl.restart)^2))
## [1] 3.503099

The model has an MSE (Which is the variance of our estimator) of 3.5. Another way of thinking about this is that \(\sqrt{MSE}\) is essentially the standard deviation of the estimator which, in this case is 1.87

Next we’ll look at the distribution of our data and of the fitted values from the model. We assume that the count data comes from a Poisson model with a center parameter equal to the mean # of individuals observed.

lamb = round(mean(abun.dat.scl[,y.var] - 1))

hist(abun.dat.scl[,y.var] - 1, probability = TRUE, breaks = 20,
     col = rgb(1,0,0,.4),
     ylim = c(0,max(dpois(0:20,lamb)+.1)),
     main = "Observed Data",
     xlab = paste(y.var,"Abundance - 1"))

lines(y = dpois(0:20,lambda = lamb),x = 0:20, col = "red",lwd = 1.5)
legend(x = "topright",legend = paste0("dpois(lambda=",lamb,")"),col = "red",lwd = 1.5)

# curve(from = 0, to = 20,dpois(x, lambda = lamb),add = TRUE, col = "red")

hist(fitted(abun.mod.scl.restart),probability = TRUE,
     col = rgb(0,0,1,.4), add = FALSE,
     ylim = c(0,max(dpois(0:20,lamb)+.1)),
     main = "Fitted Values",
     xlab = paste("Fitted",y.var,"Abundance"))

lines(y = dpois(0:20,lambda = lamb),x = 0:20, col = "red",lwd = 1.5)

legend(x = "topright",legend = paste0("dpois(lambda=",lamb,")"),col = "red",lwd = 1.5)

# curve(from = 0, to = 20,dpois(x, lambda = lamb),add = TRUE, col = "red")

Both our observed data and the predicted data seem to fit same poisson distribution quite well. It should be noted that, because the data have no zeros, the model is predicting zeros. To account for this, shifting the fitted values up 1 seems to be effective.

abun.dat.scl$fitted <- fitted(abun.mod.scl.restart) + 1

ggplot(data = abun.dat.scl, aes(x = Min.per.Tree, y = get(y.var)))+
  geom_point()+
  geom_point(aes(y = fitted, x = Min.per.Tree + .2), col = "red")

fit.mod <- lm(fitted ~ get(y.var), data = abun.dat.scl)
# summary(fit.mod)

plot(fitted ~ get(y.var), data = abun.dat.scl)
abline(fit.mod, col = "red");abline(a = 0, b = 1, col = "blue", lty = "dashed")

Negative Binomial Regression

Based on diagnostic plots from the Poisson model, the negative binomial regression may fit slightly better.

# save the model objects
save(list = c(unique(model.save.list)), 
     file = "data/initial-modeling-objects.RData")