# variable inclusion table for insect-trait models
model.table <- data.table::fread("data/selected-insect-models_AIC.csv",
data.table = FALSE)
# model table for trait-trait models
trait.covar.table <- data.table::fread("data/traits-with-covars-table_gwa.csv",
data.table = FALSE)
# list of common insects in a logical order
insect.levels <- c("Aspen.Leaf.Beetle","Blackmine", #beetles
"Cotton.Scale","Leafhoppers","Casebearer.Moth", #'free-feeding, uncategorized
"Blotch.Mine","Cottonwood.Leaf.Mine","Lombardy.Mine", # miners
"Leaf.Edge.Mine","Weevil.Mine", #miners (continued)
"Green.Sawfly","Phyllocolpa","Pale.Green.Notodontid", #caterpillar-like
"Harmandia","Petiole.Gall", # gallers
"Smokey.Aphids","Green.Aphids","Ants") #aphids and ants
common.insects <- factor(insect.levels,levels=insect.levels)
model.table$response <- factor(model.table$response,levels = insect.levels) # convert column
# list of gwa traits
gwa.traits <- c("Hobs","BA.2012sqrt","SLA","ALA","CT","PG","Npct",
# "Sex.TEMP", # exclude sex
"CN","BAsqrt","Vol","EFNMean","BBreakDegDay","Flprev")
gwa.trait.lvls = c("SLA","ALA","EFNMean","BBreakDegDay", # leaf traits
"PG","CT","CN","Npct", # leaf chem
"Flprev","Sex.TEMP", # repro
"BAsqrt","BA.2012sqrt","Vol", # size
"Hobs" # other
)
diversity.traits = c("MDS1","MDS2","MDS3","MDS4","richness","evenness","shannon.index")
if(!params$run.mods){ # load in models, if not re-running
loaded <- load("data/null-gwa-models.RData")
}
# arguments
add.snps <- function(df = NULL, # data frame to merge SNPs into
SNP.file = "data/additive-SNP-data.csv", # SNP file
GWA.ID.col = c("GWA.ID","GWA.ID"), # gwa id col names (df, snp)
which.SNPs = 1:10 # which SNPs should we add?
){
# other prarams
SNP.N <- count.fields("data/additive-SNP-data.csv", sep = ",")[1] - 1
# argument checks
stopifnot(which.SNPs <= SNP.N & which.SNPs >0) # possible SNP
# data
if(is.null(df)){ # default df
df <- fread("data/phenos-and-covars.csv", data.table = FALSE,
select = c("SerialNo","Genet.SSRrev", "GWA.ID",
gwa.traits, common.insects, diversity.traits))
} else {df <- as.data.frame(df)}
#SNP data
SNP.dat <- fread("data/additive-SNP-data.csv", select = c(1,which.SNPs + 1))
names(SNP.dat) <- gsub(names(SNP.dat),pattern = ":",replacement = ".")
snp.names <- names(SNP.dat)[-1]
# merge together the df and SNP data
nrow.og <- nrow(df)
df <- merge(df, SNP.dat, by.x = GWA.ID.col[1], by.y = GWA.ID.col[2],
all.x = TRUE, all.y = FALSE)
stopifnot(nrow(df) == nrow.og)
return(list(df = df, snps = snp.names))
}
add.snps(which.SNPs = 5:7)$snps
## [1] "Potra189406.50" "Potra196383.56" "Potra196383.59"
# This function fits the model for GWA data
fit.gwa.model <- function(
# arguments
response = "Harmandia", # response variable
fam = "binomial", # distribution family
binom.condition = ">0", # binomial condition (i.e. Harmandia>0)
genet.min = 2, # smallest sample size to use per time period
common.fixed = c("Block","Dist.Edge"), # fixed effects that are not 'covariates'
covars = NULL, # list of covariates
scale.vars=TRUE, # should numeric columns be scaled, including response?
dont.scale = c("Sex.TEMP"), # non-numeric variables (resp or covars) not to scaled
scale.explicit = NULL,
scale.resp = TRUE, # should the response be scaled (only matters if scale.covars)
SNP.no = NULL, # which SNP to use
random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)", # random effects term for lme4
data.file = "data/phenos-and-covars.csv", # path to the data file
SNP.file = "data/additive-SNP-data.csv",
verbose = 0 # print info about each model?
){
# define response variable (for binomial distro)
if(fam == "binomial" & !is.null(binom.condition)){
form.resp <- paste0(response,binom.condition)# combine the response and binomial condition
} else {form.resp <- response}
# create fixed effects term
if(length(covars)>0){ # no covariates
fixed.effects <- paste(paste(common.fixed,collapse="+"),
paste(covars,collapse="+"),
sep = "+")
} else { # with covariates
fixed.effects <- paste(common.fixed,collapse="+")
}
# model formula
form <- paste0(form.resp,"~",
paste(fixed.effects,collapse = "+"),
"+",random.term)
# model variables
mod.vars <- unique(c(all.vars(formula(form)),
"survey.event","Genet.SSRrev","SerialNo","GWA.ID")) #always include these
# read data.frame
tmp <- fread(data.file,select=mod.vars)
# scale variables
if(!is.null(covars) & scale.vars){
tmp <- tmp %>% mutate_at(covars[!covars %in% dont.scale], scale) # scale covars not in dont.scale
}
if(scale.vars & fam != "binomial" &!response %in% dont.scale){
tmp <- tmp %>% mutate_at(response, scale) # scale response
}
if(!is.null(scale.explicit)){tmp <- tmp %>% mutate_at(scale.explicit, scale)}
# Add in SNPs, if asked
snp.name = NULL
if(!is.null(SNP.no)){
snp.obj <- add.snps(tmp,which.SNPs = SNP.no)
tmp <- snp.obj$df
snp.name <- snp.obj$snps
form <- paste(form,snp.name, sep = "+")
}
# filter by genet.min
sample.n <- tmp %>% na.omit() %>% #first remove trees with missing data to ensure proper sample n in filter step
group_by(Genet.SSRrev,survey.event) %>% # group by genet
tally() %>% filter(n>=genet.min) # tally remaining individuals per genet (after removing missing data)
tmp <- tmp %>% filter(Genet.SSRrev %in% sample.n$Genet.SSRrev)
if(verbose > 0){ # print model info
cat("fitting model for",as.character(response),"\n",sep = " ")
if(verbose > 1) {
cat(form,"\n")
}
}
if(fam == "normal"){ # fit model
fm <- lmer(data = tmp, formula = form, REML = FALSE) # LMM
} else {
fm <- glmer(data = tmp, formula = formula(form), family = fam) # GLMM
}
return(list("fm" = fm, "response" = response, "snp" = snp.name)) # return the model and response name
}
fit.gwa.model(SNP.no = 10)
## $fm
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## Harmandia > 0 ~ Block + Dist.Edge + (1 | Survey.Year/Survey.Month/Genet.SSRrev) +
## Potra001067.64
## Data: tmp
## AIC BIC logLik deviance df.resid
## 5711.244 5757.567 -2848.622 5697.244 5521
## Random effects:
## Groups Name Std.Dev.
## Genet.SSRrev:(Survey.Month:Survey.Year) (Intercept) 0.6226578
## Survey.Month:Survey.Year (Intercept) 0.3069259
## Survey.Year (Intercept) 0.0000875
## Number of obs: 5528, groups:
## Genet.SSRrev:(Survey.Month:Survey.Year), 1520; Survey.Month:Survey.Year, 4; Survey.Year, 2
## Fixed Effects:
## (Intercept) Block Dist.Edge Potra001067.64
## -1.89224 0.11440 0.03925 0.07515
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings
##
## $response
## [1] "Harmandia"
##
## $snp
## [1] "Potra001067.64"
fit.gwa.model("Vol",fam = "normal",SNP.no = 10)
## $fm
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula:
## Vol ~ Block + Dist.Edge + (1 | Survey.Year/Survey.Month/Genet.SSRrev) +
## Potra001067.64
## Data: tmp
## AIC BIC logLik deviance df.resid
## 12485.542 12538.404 -6234.771 12469.542 5466
## Random effects:
## Groups Name Std.Dev.
## Genet.SSRrev:(Survey.Month:Survey.Year) (Intercept) 0.6837
## Survey.Month:Survey.Year (Intercept) 0.0000
## Survey.Year (Intercept) 0.4396
## Residual 0.5959
## Number of obs: 5474, groups:
## Genet.SSRrev:(Survey.Month:Survey.Year), 1516; Survey.Month:Survey.Year, 4; Survey.Year, 2
## Fixed Effects:
## (Intercept) Block Dist.Edge Potra001067.64
## 0.003035 -0.072268 0.051651 -0.009995
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings
##
## $response
## [1] "Vol"
##
## $snp
## [1] "Potra001067.64"
# function to conduct goodness of fit test for binomial model (Hosmer-Lemeshow)
GOF.binom <- function(
# arguments
gwa.fit, groups = 10, # number of bins to use for grouping
print.table = FALSE, type = 2 # should the contingency table be printed?
){
# other parameters
fm = gwa.fit$fm # fitted model
dat = fm@frame # data frame
fv = fitted(fm) # fitted values
resp = gwa.fit$response # response variable
# first, split the data into quantiles
q = quantile(fv, seq(0,1,1/groups), type = 2)
# then group the fitted values according to quantile
fv.g = cut(fv, breaks = q, include.lowest = TRUE)
# contigency table for fitted values and response values
obs = xtabs(~ fv.g + dat[,1])
# expected values
fit = cbind(e.0 = tapply(1-fv, fv.g, sum), # expected values for FALSE
e.1 = tapply(fv, fv.g, sum)) # and for TRUE
if(print.table){print(cbind(obs,fit))}
chi.sqr = sum((obs - fit)^2/fit) # calc chisqr stat
pval = pchisq(chi.sqr, groups-2, lower.tail = FALSE) # calc p value
# print the results as a data frame
data.frame(test = "Hosmer-Lemeshow",response = resp, groups = groups, chi.sqr = chi.sqr, pvalue = pval)
}
# function to create a qqplot for LMM (only) gwa.fit objects
gwa.qqplot <- function(
gwa.fit, # fitted gwa object
box.prop = .95, # middle x percent of observations to show wit red box
base.size = 8 # base text size for plot
){
ggplot(data = gwa.fit$fm@frame,
aes(sample = resid(gwa.fit$fm))) +
# Add a rectangle to
geom_rect(xmin = -2, xmax = 2,
ymin = quantile(resid(gwa.fit$fm),
probs = 1 - ((1 - box.prop)/2)
),
ymax = quantile(resid(gwa.fit$fm),
probs = (1 - box.prop)/2
),
col = "red", alpha = .01, fill = NA, lty = "dashed", size = .1
) +
geom_qq() + geom_qq_line() +# qq plot
theme_bw(base_size = base.size) +
labs(title = paste("QQ:",gwa.fit$response))
}
# function to make a residual plot for the MM gwa.fit objects
gwa.resid.plot <- function(
gwa.fit, # gwa.fit object
CI.lvl = .95, # confidence interval for smooth
base.size = 8 # base text size for plot
){
ggplot(data = gwa.fit$fm@frame,
aes(y = residuals(gwa.fit$fm, type = "deviance"),
x = fitted(gwa.fit$fm))
) + geom_point() + theme_bw(base_size = base.size) +
labs(title = gwa.fit$response,
y = "resid", x = "fitted val")+
geom_hline(yintercept = 0, lty = "dotted", size = .5, col = "black") +
# add localized regression lines:
geom_smooth(method = "loess", level = CI.lvl, col = "blue")
# geom_smooth(method = "gam", level = CI.lvl, col = "red") +
# geom_smooth(method = "lm", level = CI.lvl, col = "green")
}
# lapply(insect.trait.models, gwa.resid.plot, CI.lvl = .98)
if(params$run.mods){
insect.mods <- lapply(X = common.insects,FUN = fit.gwa.model, verbose = TRUE)
names(insect.mods) <- common.insects
}
Because the insect models have binomial responses, we can perform a “Hosmer-Lemeshow” goodness of fit test, to check the fit of our models.
lapply(insect.mods, GOF.binom) %>% bind_rows() %>% select(response, chi.sqr, pvalue) %>% knitr::kable(digits = 4)
response | chi.sqr | pvalue |
---|---|---|
Aspen.Leaf.Beetle | 38.5631 | 0.0000 |
Blackmine | 447.6036 | 0.0000 |
Cotton.Scale | 17.5987 | 0.0244 |
Leafhoppers | 33.7037 | 0.0000 |
Casebearer.Moth | 135.4415 | 0.0000 |
Blotch.Mine | 35.9745 | 0.0000 |
Cottonwood.Leaf.Mine | 213.9116 | 0.0000 |
Lombardy.Mine | 106.4185 | 0.0000 |
Leaf.Edge.Mine | 108.3890 | 0.0000 |
Weevil.Mine | 14.9096 | 0.0609 |
Green.Sawfly | 32.0219 | 0.0001 |
Phyllocolpa | 463.6605 | 0.0000 |
Pale.Green.Notodontid | 10.5859 | 0.2263 |
Harmandia | 517.4735 | 0.0000 |
Petiole.Gall | 500.5182 | 0.0000 |
Smokey.Aphids | 306.0878 | 0.0000 |
Green.Aphids | 84.1119 | 0.0000 |
Ants | 33.5777 | 0.0000 |
We can see that for nearly all our models, the fit is very good. The Pale.Green.Notodontid
and Weevil.Mine
models have the worst fit, according to this metric.
plots <- lapply(insect.trait.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))
Residual plots are difficult to interpret with binomial data, but local regression of fitted values and residuals seem to be approximately correct. Ideally, the line would be horizontal with y intercept = 0 to indicate that there is no correlation between our predicted outcome and the residuals.
The covariates used for each model are as follows, chosen by exhaustive model selection:
# library(data.table);library(ggplot2)
ggplot(melt(model.table,id.vars=c("response","model.names")),
aes(x = variable, y = response, fill = value)) +
geom_tile(col = "grey") +
scale_fill_manual(values = c("TRUE" = "black", "FALSE" = "white")) +
theme(axis.text.x = element_text(angle = 35, hjust = 1)) +
labs(fill = NULL, title = "Selected Trait Covariates for Insect GWA models")
if(params$run.mods){
variables <- names(model.table)[!names(model.table) %in%
c("response","model.names","Block","Dist.Edge",
"Survey.Year","Survey.Month")]
insect.trait.models <- list()
for(i in 1:nrow(model.table)){
resp = model.table[i,"response"]
values = model.table[i,variables] %>% unlist()
covars <- variables[values]
insect.trait.models[[resp]] <- fit.gwa.model(response = resp,
covars = covars, verbose = TRUE)
}
}
lapply(insect.trait.models, FUN = GOF.binom)%>% bind_rows() %>% select(response, chi.sqr, pvalue) %>% knitr::kable(digits = 4)
response | chi.sqr | pvalue |
---|---|---|
Aspen.Leaf.Beetle | 29.4845 | 0.0003 |
Blackmine | 6.2966 | 0.6140 |
Cotton.Scale | 6.8532 | 0.5526 |
Leafhoppers | 3.4998 | 0.8992 |
Casebearer.Moth | 16.0031 | 0.0423 |
Blotch.Mine | 5.3181 | 0.7231 |
Cottonwood.Leaf.Mine | 46.7989 | 0.0000 |
Lombardy.Mine | 25.8806 | 0.0011 |
Leaf.Edge.Mine | 20.5517 | 0.0084 |
Weevil.Mine | 5.5388 | 0.6987 |
Green.Sawfly | 12.3744 | 0.1353 |
Phyllocolpa | 171.6580 | 0.0000 |
Pale.Green.Notodontid | 4.1648 | 0.8420 |
Harmandia | 61.8971 | 0.0000 |
Petiole.Gall | 177.5860 | 0.0000 |
Smokey.Aphids | 132.3322 | 0.0000 |
Green.Aphids | 61.7514 | 0.0000 |
Ants | 6.5261 | 0.5885 |
Interestingly, the goodness of fit test says that far fewer of or models fit the data well when covariates are included. This is especially interesting, considering that we selected the absolute best models for each insect. However, model selection was done on ‘greater than average’ condition instead of the ‘greater than 0’ condition. However, we’ve shown that these 2 conditions are virtually identical except in the case of Green.Aphids
. Perhaps I should re-run the entire pipe-line, including model selection, to be sure.
Based on the residual plots below, I’m not convinced that the goodness of fit test yields good results for this model. I don’t believe it is correctly accounting for the multiple regression and mixed effects. The degrees of freedom are difficult to define in GLMMs and therefore a chi-sqr test may not be a very meaningful test here.
plots <- lapply(insect.trait.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))
Though the goodness of fit test showed less than ideal results, these residual plots look no worse than those models without covariates. This suggests that The models actually fit our data pretty well.
if(params$run.mods){
normal.trait.mods <- lapply(X = gwa.traits, FUN = fit.gwa.model, fam = "normal",verbose = TRUE,
common.fixed = c("Block","Dist.Edge","Age"))
names(normal.trait.mods) <- gwa.traits
}
Because (most) of the trait data are normal, we can use QQ plots to assess fit.
plots <- lapply(normal.trait.mods,gwa.qqplot)
do.call("grid.arrange", c(plots, ncol=4))
For the most part, our QQ plots also look good. nearly all of our traits’ residuals follow the normal distribution. This is especially true when you consider that the red box encircles 95% of the data and that the deviations from the 1:1 line are almost exclusively in the 2.5% tails.
plots <- lapply(normal.trait.mods, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol = 4))
The only residual plots that don’t look quite right are initial BA and Volume, to a lesser degree. Since I’m not really interested in how initial size is affected by genotype, I’m not too worried about this.
trait.covar.table$form <- gsub(trait.covar.table$form, pattern = "\\\\",replacement = "")
if(params$run.mods){
variables <- c(gwa.traits,"Sex.TEMP")
trait.covar.models <- list()
for(i in 1:nrow(trait.covar.table)){
resp = trait.covar.table[i,"trait"]
# values = model.table[i,variables] %>% unlist()
mod.fam <- trait.covar.table[i,"family"]
mod.vars <- all.vars(formula(trait.covar.table$form[i]))
covars <- mod.vars[!mod.vars %in%
c(resp,"Block","Dist.Edge",
"Age", "Survey.Year","Survey.Month",
"Genet.SSRrev","survey.event")]
# covars <- mod.vars[mod.vars %in% gwa.traits & !mod.vars %in% resp]
binom.cond <- ifelse(resp == "Sex.TEMP","=='M'",">0")
cat("resp =", resp, "\n")
cat("covars =", covars,"\n")
trait.covar.models[[resp]] <- fit.gwa.model(response = resp,
fam = mod.fam,
binom.condition = binom.cond,
common.fixed = c("Block","Dist.Edge",
"Age"),
scale.vars = TRUE,
covars = covars,
verbose = TRUE)
}
}
plots <- lapply(trait.covar.models, gwa.qqplot)
do.call("grid.arrange", c(plots, ncol=4))
plots <- lapply(trait.covar.models, gwa.resid.plot)
do.call("grid.arrange", c(plots, ncol=4))
set.seed(224479)
SNP.N <- (count.fields("data/additive-SNP-data.csv", sep = ",")[1] - 1)
rand.insect <- sample(common.insects, size = 1)
rand.trait <- sample(gwa.traits[!gwa.traits %in% c("Flprev","Sex.TEMP")], size = 1)
rand.snp <- sample(1:SNP.N,size = 1)
# results data
rand.insect.gwa.path <- file.path("data/master-gwa-results",
paste("InsectGWA",rand.insect,
"master_gwa-results.csv",
sep = "_"))
insect.df <- fread(rand.insect.gwa.path)
insect.df$SNP.name <- gsub(insect.df$SNP.name, pattern = ":",replacement = ".")
insect.rand.sig.snp <- insect.df %>% filter(pval.chisqr <= .05) %>% select(SNP.no) %>%
unlist() %>% unname() %>% sample(size = 1)
rand.trait.gwa.path <- file.path("data/master-gwa-results",
paste("TraitGWA",rand.trait,
"master_gwa-results.csv",
sep = "_"))
trait.df <- fread(rand.trait.gwa.path)
trait.df$SNP.name <- gsub(trait.df$SNP.name, pattern = ":",replacement = ".")
trait.rand.sig.snp <- trait.df %>% filter(pval.chisqr <= .05) %>% select(SNP.no) %>%
unlist() %>% unname() %>% sample(size = 1)
# models
fm.1 <- fit.gwa.model(response = rand.insect, fam = "binomial",binom.condition = ">0",
genet.min = 2, common.fixed = c("Block","Dist.Edge"),
random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)",
scale.vars = TRUE, dont.scale = "Harmandia",
SNP.no = rand.snp, scale.explicit = c("Dist.Edge")
)
fm.sig.1 <- fit.gwa.model(response = rand.insect, fam = "binomial",binom.condition = ">0",
genet.min = 2, common.fixed = c("Block","Dist.Edge"),
random.term = "(1|Survey.Year/Survey.Month/Genet.SSRrev)",
scale.vars = TRUE, dont.scale = "Harmandia",
SNP.no = insect.rand.sig.snp, scale.explicit = c("Dist.Edge")
)
fm.2 <- fit.gwa.model(response = rand.trait, fam = "normal", genet.min = 2,
common.fixed = c("Block","Dist.Edge","Age"),
scale.vars = TRUE,
SNP.no = rand.snp, scale.explicit = c("Dist.Edge"))
fm.sig.2 <- fit.gwa.model(response = rand.trait, fam = "normal", genet.min = 2,
common.fixed = c("Block","Dist.Edge","Age"),
scale.vars = TRUE,
SNP.no = trait.rand.sig.snp, scale.explicit = c("Dist.Edge"))
# compare with current gwa function
fixef(fm.1$fm)
## (Intercept) Block Dist.Edge Potra001586.2395
## 0.3934001 -0.3154390 0.4062219 0.2415222
car::Anova(fm.1$fm)
insect.df[match(fm.1$snp, insect.df$SNP.name),]
fixef(fm.sig.1$fm)
## (Intercept) Block Dist.Edge Potra003937.38342
## 0.4515095 -0.3157512 0.4054067 -0.1932408
car::Anova(fm.sig.1$fm)
insect.df[match(fm.sig.1$snp, insect.df$SNP.name),]
fixef(fm.2$fm)
## (Intercept) Block Dist.Edge Age
## -1.19131497 0.01334985 0.12974246 0.19760628
## Potra001586.2395
## -0.06748869
car::Anova(fm.2$fm)
trait.df[match(fm.2$snp, trait.df$SNP.name)]
fixef(fm.sig.2$fm)
## (Intercept) Block Dist.Edge Age
## -1.18102974 0.01388521 0.13048902 0.19891308
## Potra173032.8857
## -0.11861115
car::Anova(fm.sig.2$fm)
trait.df[match(fm.sig.2$snp, trait.df$SNP.name)]
if(params$save){
save(insect.mods,normal.trait.mods,insect.trait.models,trait.covar.models,
file = "data/null-gwa-models.RData")
}