# variable inclusion table for insect-trait models
model.table <- data.table::fread("data/selected-insect-models_AIC.csv",
                                 data.table = FALSE)
# model table for trait-trait models
trait.covar.table <- data.table::fread("data/traits-with-covars-table_gwa.csv",
                                       data.table = FALSE)
# list of common insects in a logical order
insect.levels <- c("Aspen.Leaf.Beetle","Blackmine", #beetles
                   "Cotton.Scale","Leafhoppers","Casebearer.Moth", #'free-feeding, uncategorized
                   "Blotch.Mine","Cottonwood.Leaf.Mine","Lombardy.Mine", # miners
                   "Leaf.Edge.Mine","Weevil.Mine", #miners (continued)
                   "Green.Sawfly","Phyllocolpa","Pale.Green.Notodontid", #caterpillar-like
                   "Harmandia","Petiole.Gall", # gallers
                   "Smokey.Aphids","Green.Aphids","Ants") #aphids and ants
common.insects <- factor(insect.levels,levels=insect.levels)

model.table$response <- factor(model.table$response,levels = insect.levels) # convert column

# list of gwa traits
gwa.traits <- c("Hobs","BA.2012sqrt","SLA","ALA","CT","PG","Npct",
                # "Sex.TEMP", # exclude sex
                "CN","BAsqrt","Vol","EFNMean","BBreakDegDay","Flprev")
gwa.trait.lvls = c("SLA","ALA","EFNMean","BBreakDegDay", # leaf traits
                     "PG","CT","CN","Npct", # leaf chem
                     "Flprev","Sex.TEMP", # repro
                     "BAsqrt","BA.2012sqrt","Vol", # size
                     "Hobs" # other
) 

diversity.traits = c("MDS1","MDS2","MDS3","MDS4","richness","evenness","shannon.index")


if(!params$run.mods){ # load in models, if not re-running
  loaded <- load("data/null-gwa-models.RData") 
}

Define Functions

# arguments
add.snps <- function(df = NULL, # data frame to merge SNPs into
                     SNP.file = "data/additive-SNP-data.csv", #  SNP file
                     GWA.ID.col = c("GWA.ID","GWA.ID"), # gwa id col names (df, snp)
                     which.SNPs = 1:10 # which SNPs should we add?
){
  # other prarams
  SNP.N <- count.fields("data/additive-SNP-data.csv", sep = ",")[1] - 1
  
  # argument checks
  stopifnot(which.SNPs <= SNP.N & which.SNPs >0) # possible SNP
  
  # data
  if(is.null(df)){ # default df
    df <- fread("data/phenos-and-covars.csv", data.table = FALSE,
                select = c("SerialNo","Genet.SSRrev", "GWA.ID",
                           gwa.traits, common.insects, diversity.traits))
  } else {df <- as.data.frame(df)}
  #SNP data
  SNP.dat <- fread("data/additive-SNP-data.csv", select = c(1,which.SNPs + 1))
  names(SNP.dat) <- gsub(names(SNP.dat),pattern = ":",replacement = ".")
  snp.names <- names(SNP.dat)[-1]
  # merge together the df and SNP data
  nrow.og <- nrow(df)
  df <- merge(df, SNP.dat, by.x = GWA.ID.col[1], by.y = GWA.ID.col[2], 
              all.x = TRUE, all.y = FALSE)
  stopifnot(nrow(df) == nrow.og)
  
  return(list(df = df, snps = snp.names))
}

add.snps(which.SNPs = 5:7)$snps
## [1] "Potra189406.50" "Potra196383.56" "Potra196383.59"
# This function fits the model for GWA data
fit.gwa.model <- function(
  # arguments
  response = "Harmandia", # response variable
  fam = "binomial", # distribution family
  binom.condition = ">0", # binomial condition (i.e. Harmandia>0)
  genet.min = 2, # smallest sample size to use per time period
  common.fixed = c("Block","Dist.Edge"), # fixed effects that are not 'covariates'
  covars = NULL, # list of covariates
  scale.vars=TRUE, # should numeric columns be scaled, including response?
  dont.scale = c("Sex.TEMP"), # non-numeric variables (resp or covars) not to scaled
  scale.explicit = NULL,
  scale.resp = TRUE, # should the response be scaled (only matters if scale.covars)
  SNP.no = NULL, # which SNP to use
  random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)", # random effects term for lme4
  data.file = "data/phenos-and-covars.csv", # path to the data file
  SNP.file = "data/additive-SNP-data.csv",
  verbose = 0 # print info about each model?
){
  # define response variable (for binomial distro)
  if(fam == "binomial" & !is.null(binom.condition)){
    form.resp <- paste0(response,binom.condition)# combine the response and binomial condition
  } else {form.resp <- response}
  
  # create fixed effects term
  if(length(covars)>0){ # no covariates
    fixed.effects <- paste(paste(common.fixed,collapse="+"),
                           paste(covars,collapse="+"),
                           sep = "+")
  } else { # with covariates
    fixed.effects <- paste(common.fixed,collapse="+")
  }
  # model formula
  form <- paste0(form.resp,"~",
                 paste(fixed.effects,collapse = "+"),
                 "+",random.term)
  # model variables
  mod.vars <- unique(c(all.vars(formula(form)),
                       "survey.event","Genet.SSRrev","SerialNo","GWA.ID")) #always include these
  # read data.frame
  tmp <- fread(data.file,select=mod.vars)
  # scale variables
  if(!is.null(covars) & scale.vars){
  tmp <- tmp %>% mutate_at(covars[!covars %in% dont.scale], scale) # scale covars not in dont.scale
  }
  if(scale.vars & fam != "binomial" &!response %in% dont.scale){
    tmp <- tmp %>% mutate_at(response, scale) # scale response
  }
  if(!is.null(scale.explicit)){tmp <- tmp %>% mutate_at(scale.explicit, scale)}
  # Add in SNPs, if asked
  snp.name = NULL
  if(!is.null(SNP.no)){
    snp.obj <- add.snps(tmp,which.SNPs = SNP.no)
    tmp <- snp.obj$df
    snp.name <- snp.obj$snps
    form <- paste(form,snp.name, sep = "+")
  }
  
  # filter by genet.min 
  sample.n <- tmp %>% na.omit() %>% #first remove trees with missing data to ensure proper sample n in filter step
    group_by(Genet.SSRrev,survey.event) %>%  # group by genet
    tally() %>% filter(n>=genet.min) # tally remaining individuals per genet (after removing missing data)
  tmp <- tmp %>% filter(Genet.SSRrev %in% sample.n$Genet.SSRrev)
  
  
  if(verbose > 0){ # print model info
    cat("fitting model for",as.character(response),"\n",sep = " ")
    if(verbose > 1) {
      cat(form,"\n")
    }
  }
  
  if(fam == "normal"){ # fit model
    fm <- lmer(data = tmp, formula = form, REML = FALSE) # LMM
  } else {
    fm <- glmer(data = tmp, formula = formula(form), family = fam) # GLMM
  }
  return(list("fm" = fm, "response" = response, "snp" = snp.name)) # return the model and response name
}

fit.gwa.model(SNP.no = 10)
## $fm
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## Harmandia > 0 ~ Block + Dist.Edge + (1 | Survey.Year/Survey.Month/Genet.SSRrev) +  
##     Potra001067.64
##    Data: tmp
##       AIC       BIC    logLik  deviance  df.resid 
##  5711.244  5757.567 -2848.622  5697.244      5521 
## Random effects:
##  Groups                                  Name        Std.Dev. 
##  Genet.SSRrev:(Survey.Month:Survey.Year) (Intercept) 0.6226578
##  Survey.Month:Survey.Year                (Intercept) 0.3069259
##  Survey.Year                             (Intercept) 0.0000875
## Number of obs: 5528, groups:  
## Genet.SSRrev:(Survey.Month:Survey.Year), 1520; Survey.Month:Survey.Year, 4; Survey.Year, 2
## Fixed Effects:
##    (Intercept)           Block       Dist.Edge  Potra001067.64  
##       -1.89224         0.11440         0.03925         0.07515  
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings 
## 
## $response
## [1] "Harmandia"
## 
## $snp
## [1] "Potra001067.64"
fit.gwa.model("Vol",fam = "normal",SNP.no = 10)
## $fm
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: 
## Vol ~ Block + Dist.Edge + (1 | Survey.Year/Survey.Month/Genet.SSRrev) +  
##     Potra001067.64
##    Data: tmp
##       AIC       BIC    logLik  deviance  df.resid 
## 12485.542 12538.404 -6234.771 12469.542      5466 
## Random effects:
##  Groups                                  Name        Std.Dev.
##  Genet.SSRrev:(Survey.Month:Survey.Year) (Intercept) 0.6837  
##  Survey.Month:Survey.Year                (Intercept) 0.0000  
##  Survey.Year                             (Intercept) 0.4396  
##  Residual                                            0.5959  
## Number of obs: 5474, groups:  
## Genet.SSRrev:(Survey.Month:Survey.Year), 1516; Survey.Month:Survey.Year, 4; Survey.Year, 2
## Fixed Effects:
##    (Intercept)           Block       Dist.Edge  Potra001067.64  
##       0.003035       -0.072268        0.051651       -0.009995  
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings 
## 
## $response
## [1] "Vol"
## 
## $snp
## [1] "Potra001067.64"
# function to conduct goodness of fit test for binomial model (Hosmer-Lemeshow)
GOF.binom <- function(
  # arguments
  gwa.fit, groups = 10, # number of bins to use for grouping
  print.table = FALSE, type = 2 # should the contingency table be printed?
){ 
  # other parameters
  fm = gwa.fit$fm # fitted model
  dat = fm@frame # data frame
  fv = fitted(fm) # fitted values
  resp = gwa.fit$response # response variable
  
  # first, split the data into quantiles
  q = quantile(fv, seq(0,1,1/groups), type = 2)
  # then group the fitted values according to quantile
  fv.g = cut(fv, breaks = q, include.lowest = TRUE)
  # contigency table for fitted values and response values
  obs = xtabs(~ fv.g + dat[,1])
  # expected values 
  fit = cbind(e.0 = tapply(1-fv, fv.g, sum), # expected values for FALSE
               e.1 = tapply(fv, fv.g, sum)) # and for TRUE
  if(print.table){print(cbind(obs,fit))}
  chi.sqr = sum((obs - fit)^2/fit) # calc chisqr stat
  pval = pchisq(chi.sqr, groups-2, lower.tail = FALSE) # calc p value
  # print the results as a data frame
  data.frame(test = "Hosmer-Lemeshow",response = resp, groups = groups, chi.sqr = chi.sqr, pvalue = pval)
}
# function to create a qqplot for LMM (only) gwa.fit objects
gwa.qqplot <- function(
  gwa.fit, # fitted gwa object
  box.prop = .95, # middle x percent of observations to show wit red box
  base.size = 8 # base text size for plot
){
  ggplot(data = gwa.fit$fm@frame, 
         aes(sample = resid(gwa.fit$fm))) + 
    # Add a rectangle to 
    geom_rect(xmin = -2, xmax = 2, 
              ymin = quantile(resid(gwa.fit$fm), 
                              probs = 1 - ((1 - box.prop)/2)
                              ),
              ymax = quantile(resid(gwa.fit$fm),
                              probs = (1 - box.prop)/2
                              ),
              col = "red", alpha = .01, fill = NA, lty = "dashed", size = .1
    ) +
    geom_qq() + geom_qq_line() +# qq plot
     theme_bw(base_size = base.size) +
    labs(title = paste("QQ:",gwa.fit$response))
}
# function to make a residual plot for the MM gwa.fit objects
gwa.resid.plot <- function(
  gwa.fit, # gwa.fit object
  CI.lvl = .95, # confidence interval for smooth
  base.size = 8 # base text size for plot
){
  ggplot(data = gwa.fit$fm@frame, 
         aes(y = residuals(gwa.fit$fm, type = "deviance"), 
             x = fitted(gwa.fit$fm))
  ) + geom_point() + theme_bw(base_size = base.size) +
    labs(title = gwa.fit$response,
         y = "resid", x = "fitted val")+
    geom_hline(yintercept = 0, lty = "dotted", size = .5, col = "black") +
    # add localized regression lines:
    geom_smooth(method =  "loess", level = CI.lvl, col = "blue")
    # geom_smooth(method = "gam", level = CI.lvl, col = "red") +
    # geom_smooth(method = "lm", level = CI.lvl, col = "green")
}
# lapply(insect.trait.models, gwa.resid.plot, CI.lvl = .98)

Insect Models w/out Covariates

if(params$run.mods){
  insect.mods <- lapply(X = common.insects,FUN = fit.gwa.model, verbose = TRUE)
  names(insect.mods) <- common.insects
}

Goodness of fit test

Because the insect models have binomial responses, we can perform a “Hosmer-Lemeshow” goodness of fit test, to check the fit of our models.

lapply(insect.mods, GOF.binom) %>% bind_rows() %>% select(response, chi.sqr, pvalue) %>% knitr::kable(digits = 4)
response chi.sqr pvalue
Aspen.Leaf.Beetle 38.5631 0.0000
Blackmine 447.6036 0.0000
Cotton.Scale 17.5987 0.0244
Leafhoppers 33.7037 0.0000
Casebearer.Moth 135.4415 0.0000
Blotch.Mine 35.9745 0.0000
Cottonwood.Leaf.Mine 213.9116 0.0000
Lombardy.Mine 106.4185 0.0000
Leaf.Edge.Mine 108.3890 0.0000
Weevil.Mine 14.9096 0.0609
Green.Sawfly 32.0219 0.0001
Phyllocolpa 463.6605 0.0000
Pale.Green.Notodontid 10.5859 0.2263
Harmandia 517.4735 0.0000
Petiole.Gall 500.5182 0.0000
Smokey.Aphids 306.0878 0.0000
Green.Aphids 84.1119 0.0000
Ants 33.5777 0.0000

We can see that for nearly all our models, the fit is very good. The Pale.Green.Notodontid and Weevil.Mine models have the worst fit, according to this metric.

Residual Plots

plots <- lapply(insect.trait.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))

Residual plots are difficult to interpret with binomial data, but local regression of fitted values and residuals seem to be approximately correct. Ideally, the line would be horizontal with y intercept = 0 to indicate that there is no correlation between our predicted outcome and the residuals.

Insect Models with Trait Covariates

The covariates used for each model are as follows, chosen by exhaustive model selection:

# library(data.table);library(ggplot2)

ggplot(melt(model.table,id.vars=c("response","model.names")),
       aes(x = variable, y = response, fill = value)) + 
  geom_tile(col = "grey") + 
  scale_fill_manual(values = c("TRUE" = "black", "FALSE" = "white")) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1)) + 
  labs(fill = NULL, title = "Selected Trait Covariates for Insect GWA models")

Goodness of Fit Test

if(params$run.mods){
  variables <- names(model.table)[!names(model.table) %in%
                                    c("response","model.names","Block","Dist.Edge",
                                      "Survey.Year","Survey.Month")]
  insect.trait.models <- list()
  for(i in 1:nrow(model.table)){
    resp = model.table[i,"response"]
    values = model.table[i,variables] %>% unlist()
    covars <- variables[values]
    insect.trait.models[[resp]] <- fit.gwa.model(response = resp, 
                                                 covars = covars, verbose = TRUE)
  }
}
lapply(insect.trait.models, FUN = GOF.binom)%>% bind_rows() %>% select(response, chi.sqr, pvalue) %>% knitr::kable(digits = 4)
response chi.sqr pvalue
Aspen.Leaf.Beetle 29.4845 0.0003
Blackmine 6.2966 0.6140
Cotton.Scale 6.8532 0.5526
Leafhoppers 3.4998 0.8992
Casebearer.Moth 16.0031 0.0423
Blotch.Mine 5.3181 0.7231
Cottonwood.Leaf.Mine 46.7989 0.0000
Lombardy.Mine 25.8806 0.0011
Leaf.Edge.Mine 20.5517 0.0084
Weevil.Mine 5.5388 0.6987
Green.Sawfly 12.3744 0.1353
Phyllocolpa 171.6580 0.0000
Pale.Green.Notodontid 4.1648 0.8420
Harmandia 61.8971 0.0000
Petiole.Gall 177.5860 0.0000
Smokey.Aphids 132.3322 0.0000
Green.Aphids 61.7514 0.0000
Ants 6.5261 0.5885

Interestingly, the goodness of fit test says that far fewer of or models fit the data well when covariates are included. This is especially interesting, considering that we selected the absolute best models for each insect. However, model selection was done on ‘greater than average’ condition instead of the ‘greater than 0’ condition. However, we’ve shown that these 2 conditions are virtually identical except in the case of Green.Aphids. Perhaps I should re-run the entire pipe-line, including model selection, to be sure.

Based on the residual plots below, I’m not convinced that the goodness of fit test yields good results for this model. I don’t believe it is correctly accounting for the multiple regression and mixed effects. The degrees of freedom are difficult to define in GLMMs and therefore a chi-sqr test may not be a very meaningful test here.

Residual Plots

plots <- lapply(insect.trait.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))

Though the goodness of fit test showed less than ideal results, these residual plots look no worse than those models without covariates. This suggests that The models actually fit our data pretty well.

Trait Models

if(params$run.mods){
  normal.trait.mods <- lapply(X = gwa.traits, FUN = fit.gwa.model, fam = "normal",verbose = TRUE,
                              common.fixed = c("Block","Dist.Edge","Age"))
  names(normal.trait.mods) <- gwa.traits
}

QQ Plots

Because (most) of the trait data are normal, we can use QQ plots to assess fit.

plots <- lapply(normal.trait.mods,gwa.qqplot)
do.call("grid.arrange", c(plots, ncol=4))

For the most part, our QQ plots also look good. nearly all of our traits’ residuals follow the normal distribution. This is especially true when you consider that the red box encircles 95% of the data and that the deviations from the 1:1 line are almost exclusively in the 2.5% tails.

Residual Plots

plots <- lapply(normal.trait.mods, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol = 4))

The only residual plots that don’t look quite right are initial BA and Volume, to a lesser degree. Since I’m not really interested in how initial size is affected by genotype, I’m not too worried about this.

Trait models with Trait Covariates

trait.covar.table$form <- gsub(trait.covar.table$form, pattern = "\\\\",replacement = "")

if(params$run.mods){
  variables <- c(gwa.traits,"Sex.TEMP")
    
  trait.covar.models <- list()
  for(i in 1:nrow(trait.covar.table)){
    resp = trait.covar.table[i,"trait"]
    # values = model.table[i,variables] %>% unlist()
    mod.fam <- trait.covar.table[i,"family"]
    mod.vars <- all.vars(formula(trait.covar.table$form[i]))
    covars <- mod.vars[!mod.vars %in%
                         c(resp,"Block","Dist.Edge",
                                  "Age", "Survey.Year","Survey.Month",
                                  "Genet.SSRrev","survey.event")]
    # covars <- mod.vars[mod.vars %in% gwa.traits & !mod.vars %in% resp]
    binom.cond <- ifelse(resp == "Sex.TEMP","=='M'",">0")
    cat("resp =", resp, "\n")
    cat("covars =", covars,"\n")
    trait.covar.models[[resp]] <- fit.gwa.model(response = resp, 
                                                fam = mod.fam, 
                                                binom.condition = binom.cond,
                                                common.fixed = c("Block","Dist.Edge",
                                                                 "Age"), 
                                                scale.vars = TRUE,
                                                 covars = covars, 
                                                verbose = TRUE)
  }
}

QQ Plots

plots <- lapply(trait.covar.models, gwa.qqplot)
do.call("grid.arrange", c(plots, ncol=4))

Residual Plots

plots <- lapply(trait.covar.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))

Check SNP values

set.seed(224479)
SNP.N <- (count.fields("data/additive-SNP-data.csv", sep = ",")[1] - 1)
rand.insect <- sample(common.insects, size = 1)
rand.trait <- sample(gwa.traits[!gwa.traits %in% c("Flprev","Sex.TEMP")], size = 1)
rand.snp <- sample(1:SNP.N,size = 1)

# results data
rand.insect.gwa.path <- file.path("data/master-gwa-results",
                                  paste("InsectGWA",rand.insect,
                                        "master_gwa-results.csv",
                                        sep = "_"))
insect.df <- fread(rand.insect.gwa.path)
insect.df$SNP.name <- gsub(insect.df$SNP.name, pattern = ":",replacement = ".")

insect.rand.sig.snp <- insect.df %>% filter(pval.chisqr <= .05) %>% select(SNP.no) %>% 
  unlist() %>% unname() %>% sample(size = 1)

rand.trait.gwa.path <- file.path("data/master-gwa-results",
                                 paste("TraitGWA",rand.trait,
                                       "master_gwa-results.csv", 
                                       sep = "_"))
trait.df <- fread(rand.trait.gwa.path)
trait.df$SNP.name <- gsub(trait.df$SNP.name, pattern = ":",replacement = ".")
trait.rand.sig.snp <- trait.df %>% filter(pval.chisqr <= .05) %>% select(SNP.no) %>% 
  unlist() %>% unname() %>% sample(size = 1)

# models
fm.1 <- fit.gwa.model(response = rand.insect, fam = "binomial",binom.condition = ">0",
              genet.min = 2, common.fixed = c("Block","Dist.Edge"),
              random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)",
              scale.vars = TRUE, dont.scale = "Harmandia", 
              SNP.no = rand.snp, scale.explicit = c("Dist.Edge")
              )
fm.sig.1 <- fit.gwa.model(response = rand.insect, fam = "binomial",binom.condition = ">0",
              genet.min = 2, common.fixed = c("Block","Dist.Edge"),
              random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)",
              scale.vars = TRUE, dont.scale = "Harmandia", 
              SNP.no = insect.rand.sig.snp, scale.explicit = c("Dist.Edge")
              )

fm.2 <- fit.gwa.model(response = rand.trait, fam = "normal", genet.min = 2,
                      common.fixed = c("Block","Dist.Edge","Age"),
                      scale.vars = TRUE,
                      SNP.no = rand.snp, scale.explicit = c("Dist.Edge"))

fm.sig.2 <- fit.gwa.model(response = rand.trait, fam = "normal", genet.min = 2,
                      common.fixed = c("Block","Dist.Edge","Age"),
                      scale.vars = TRUE, 
                      SNP.no = trait.rand.sig.snp, scale.explicit = c("Dist.Edge"))

# compare with current gwa function



fixef(fm.1$fm)
##      (Intercept)            Block        Dist.Edge Potra001586.2395 
##        0.3934001       -0.3154390        0.4062219        0.2415222
car::Anova(fm.1$fm)
insect.df[match(fm.1$snp, insect.df$SNP.name),]
fixef(fm.sig.1$fm)
##       (Intercept)             Block         Dist.Edge Potra003937.38342 
##         0.4515095        -0.3157512         0.4054067        -0.1932408
car::Anova(fm.sig.1$fm)
insect.df[match(fm.sig.1$snp, insect.df$SNP.name),]
fixef(fm.2$fm)
##      (Intercept)            Block        Dist.Edge              Age 
##      -1.19131497       0.01334985       0.12974246       0.19760628 
## Potra001586.2395 
##      -0.06748869
car::Anova(fm.2$fm)
trait.df[match(fm.2$snp, trait.df$SNP.name)]
fixef(fm.sig.2$fm)
##      (Intercept)            Block        Dist.Edge              Age 
##      -1.18102974       0.01388521       0.13048902       0.19891308 
## Potra173032.8857 
##      -0.11861115
car::Anova(fm.sig.2$fm)
trait.df[match(fm.sig.2$snp, trait.df$SNP.name)]
if(params$save){
  save(insect.mods,normal.trait.mods,insect.trait.models,trait.covar.models,
       file = "data/null-gwa-models.RData")
}