This document is meant to analyze the results of GWA analyses for insect herbivores and aspen traits as they relate to insect herbivores.
First, we’ll setup this document by loading options, packages, and functions:
GWA.results.sig <- list()
set.seed(954) # random numbers always the same
# define traits of interest
gwa.traits <- c("Hobs","BA.2012sqrt","Sex.TEMP","SLA","ALA","CT","PG","Npct",
"CN","BAsqrt","Vol","EFNMean","BBreakDegDay","Flprev")
diversity.traits <- c("richness","shannon.index","evenness",paste0("MDS",1:4))
environ.vars <- c("SerialNo","Genet.SSRrev",
"Block","Dist.Edge","Age",
"Survey.Year","Survey.Month",
"survey.event")
common.insects <- unname(unlist(fread("data/common-insects.txt",
header = FALSE)))
insect.vars <- c(common.insects,"Min.per.Tree")
SNP.N <- 113674 # total number of SNPs
# get a list of insect gwa files
insect.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^InsectGWA_")
instrait.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^InsTraitGWA_")
trait.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^TraitGWA_")
# exclude sex and hobs files
trait.files <- trait.files[!grepl(trait.files,pattern = ".*(Sex.TEMP|Hobs).*")]
# crosstrait.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
# pattern = "^crossed-effects_")
covartrait.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^CovarTraits_")
covartrait.files <- covartrait.files[!grepl(covartrait.files, pattern = ".*(Sex.TEMP|Hobs|Flprev).*")]
diversity.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^Diversity")
Indv.trait.files <- grep(list.files(params$gwa.results.dir),value = TRUE,
pattern = "^Indv-randeffects")
Indv.trait.files <- Indv.trait.files[!grepl(Indv.trait.files, pattern = ".*(BA\\.2012sqrt.|Hobs).*")]
CompleteCross.trait.files <- grep(list.files(params$gwa.results.dir), value = TRUE, pattern = "CompleteCross")
CompleteCross.trait.files <- CompleteCross.trait.files[!grepl(CompleteCross.trait.files, pattern = ".*(BA\\.2012sqrt.|Hobs).*")]
IndvCrossed.trait.files <- grep(list.files(params$gwa.results.dir), value = TRUE, pattern = "IndvCrossed")
IndvCrossed.trait.files <- IndvCrossed.trait.files[!grepl(IndvCrossed.trait.files, pattern = ".*(BA\\.2012sqrt.|Hobs).*")]
CrossNestIndv.trait.files <- grep(list.files(params$gwa.results.dir), value = TRUE, pattern = "CrossNestIndv")
CrossNestIndv.trait.files <- CrossNestIndv.trait.files[!grepl(CrossNestIndv.trait.files, pattern = ".*(BA\\.2012sqrt.|Hobs).*")]
YearNest.trait.files <- grep(list.files(params$gwa.results.dir), value = TRUE, pattern = "YearNest")
YearNest.trait.files <- YearNest.trait.files[!grepl(YearNest.trait.files, pattern = ".*(BA\\.2012sqrt.|Hobs).*")]
# annotated genes list
anno.genes <- fread("data/gwa-files/annotated-potra-genes.csv")
# results function
sig.GWA <- function(files.list = insect.files,
method = "qval", #or 'bonf'
sig.lvl = 0.05, fdr.lvl = 0.05,
full.results = TRUE){
GWA.results.sig <- data.frame()
GWA.sig.table <- data.frame()
full.results.df <- data.frame()
return.list <- list()
for(f in files.list){
# attributes
gwa.type <- unlist(strsplit(f,split = "_"))[1]
trait <- unlist(strsplit(f,split = "_"))[2] #trait name
path <- file.path(params$gwa.results.dir,f) #path to file
stopifnot(file.exists(path)) #check that the file exists
# data
tmp <- fread(path,header = TRUE) #read the file
tmp$trait <- trait
tmp$gwa.type <- gwa.type
tmp <- tmp %>% #add scaffold and within-scaffold base pair
mutate(scaff.name = gsub(SNP.name,pattern = "(.*):(.*)",
replacement = "\\1"),
scaff.bp = gsub(SNP.name,pattern = "(.*):(.*)",
replacement = "\\2"))
if(method == "qval"){
qvalues <- qvalue(p = tmp$pval.chisqr,fdr.level = fdr.lvl,lfdr.out = FALSE) #calculate q
tmp$qval <- qvalues$qvalues #add q to data
# add in gene annotations
# scaff <- tmp$scaff.name
# bp <- tmp$scaff.bp
# test <- mapply(x = scaff, y = bp, FUN = function(x,y){
# anns <- anno.genes %>% filter(Chrom == x,START <= y, END >= y)
# # anns <- ifelse(nrow(anns) < 1,NA,no = anns)
# if(nrow(anns) > 0){
# return(anns %>% select(Potra_Gene_ID,mean_alleleFreq,Functional_Description))
# # which( (anno.genes$Chrom == x) &
# # (anno.genes$START <= y) &
# # (anno.genes$END >= y))
# } else { return(data.frame(Potra_Gene_ID = NA,
# mean_alleleFreq = NA,
# Functional_Description = NA))}
# }) %>% t
# significant results
sig.data <- tmp %>% filter(qval <= sig.lvl) %>% #remove non-significant rows
select(gwa.type,trait,SNP.name,scaff.name,scaff.bp,#id columns
Estimate,Std.Error,pval.chisqr,qval)#result columns
GWA.results.sig <- rbind(GWA.results.sig, sig.data)
GWA.sig.table <- rbind(GWA.sig.table,
data.frame(gwa.type,trait,
n = length(which(tmp$qval <= sig.lvl))))
}
if(method == "bonf"){
tmp$bonf.p <- tmp$pval.chisqr*nrow(tmp)
sig.data <- tmp %>% filter(pval.chisqr <= sig.lvl/nrow(tmp)) %>%
select(gwa.type,trait,SNP.name,scaff.name,scaff.bp,
Estimate,Std.Error,pval.chisqr,bonf.p)
GWA.results.sig <- rbind(GWA.results.sig, sig.data)
GWA.sig.table <- rbind(GWA.sig.table,
data.frame(gwa.type,trait,
n = length(which(tmp$bonf.p <= sig.lvl))))
}
if(full.results){
full.results.df <- plyr::rbind.fill(full.results.df,tmp)
# full.results.df <- rbindlist(list(full.results.df,tmp),fill = TRUE,use.names = TRUE)
}
}
# add an overall q value
if(full.results){
if(method == "qval"){
full.results.df$q.all <- qvalue(p = full.results.df$pval.chisqr,
fdr.level = fdr.lvl,
lfdr.out = FALSE)$qvalues
full.results.df <- full.results.df %>%
select(gwa.type,trait,SNP.name,scaff.name,scaff.bp,#id columns
Estimate,Std.Error,pval.chisqr,qval,q.all)
}
if(method == "bonf"){
full.results.df$bonf.all <- full.results.df$pval.chisqr*nrow(full.results.df)
full.results.df <- full.results.df %>%
select(gwa.type,trait,SNP.name,scaff.name,scaff.bp,#id columns
Estimate,Std.Error,pval.chisqr,bonf.p,bonf.all)
}
}
# build return list
return.list$sig.df <- GWA.results.sig
return.list$sig.tab <- GWA.sig.table
return.list$full.results.df <- full.results.df
return(return.list)
}
Here we will compile a list of results from each model to use in the remaining sections.
# insect results
insect.sig.gwa <- sig.GWA(insect.files) #no traits
instrait.sig.gwa <- sig.GWA(instrait.files) #with traits
# trait results
trait.sig05.gwa <- sig.GWA(trait.files,sig.lvl = .05)#.05 cutoff
trait.sig01.gwa <- sig.GWA(trait.files,sig.lvl = .01)#.01 cutoff
trait.sig005.gwa <- sig.GWA(trait.files,sig.lvl = .005)#.005 cutoff
indv.trait.gwa <- sig.GWA(files.list = Indv.trait.files, sig.lvl = .05)
covartrait.sig05.gwa <- sig.GWA(covartrait.files, sig.lvl = .05)
covartrait.sig01.gwa <- sig.GWA(covartrait.files, sig.lvl = .01)
# crosstrait.sig.gwa <- sig.GWA(grep( pattern = ".*(Sex|Flprev).*",x = crosstrait.files,value = T,invert = TRUE))
diversity.sig05.gwa <- sig.GWA(diversity.files, sig.lvl = .05)
CrossNestIndv.GWA <- sig.GWA(CrossNestIndv.trait.files, sig.lvl = .05)
CompleteCross.GWA <- sig.GWA(CompleteCross.trait.files, sig.lvl = .05)
IndvCrossed.GWA <- sig.GWA(IndvCrossed.trait.files, sig.lvl = .05)
YearNest.GWA <- sig.GWA(YearNest.trait.files, sig.lvl = .05)
For model fit diagnostics see this document.
# insect.sig.gwa %>% group_by(trait) %>% tally()
(p <- ggplot(data = rbind(insect.sig.gwa$sig.tab,instrait.sig.gwa$sig.tab),
aes(x = trait, y = n, fill = gwa.type)) +
geom_bar(stat = "identity",position = "dodge", col = "grey50", width=.75) +
geom_text(aes(label = round(n/SNP.N, digits = 2)),
# geom_text(aes(label = n),
position = position_dodge(width = 1),
size = 2.5,
# angle = 25,
vjust = -.25,
hjust = .5,
col = "black") +
theme_bw(base_size = 10) +
theme(axis.text.x = element_text(angle = 25,hjust = 1),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()) +
labs(y = "Significant Associations")
) %+% labs(title = "Insect GWA Results")
merge(insect.sig.gwa$sig.tab %>% select(-gwa.type),instrait.sig.gwa$sig.tab %>% select(-gwa.type), by = "trait", suffixes = c(".InsectGWA",".InsTraitGWA"))
These are the results that I would expect. There are a reasonable number of associations for the insects and they decrease (mostly) as trait covariates are added to the model.
# trait.full.tab <- trait.full.res %>% filter(q.all < .05) %>% group_by(trait) %>% tally()
# trait.full.tab$gwa.type = "trait-full";trait.full.tab$q.cutoff="0.05 (all)"
trait.comp.dat <- rbind(cbind(trait.sig05.gwa$sig.tab,
"q.cutoff" = "0.05", "covars" = "no covars"),
# trait.full.tab,
cbind(trait.sig01.gwa$sig.tab,
"q.cutoff" = "0.01", "covars" = "no covars"),
# cbind(trait.sig005.gwa$sig.tab,"q.cutoff" = "0.005"),
cbind(covartrait.sig05.gwa$sig.tab,
"q.cutoff" = "0.05","covars" = "with covars"),
cbind(covartrait.sig01.gwa$sig.tab,
"q.cutoff" = "0.01","covars" = "with covars"))
p %+% (data = trait.comp.dat) %+% aes(fill = q.cutoff) %+%
labs(title = "Trait GWA Results", fill = "q value cutoff") +
facet_wrap(~covars) + theme(axis.text.x = element_text(angle = 45))
total.trait.assoc.prop01 <- (trait.sig01.gwa$sig.df$SNP.name %>% unique() %>% length())/SNP.N
total.covtrait.assoc.prop <- (covartrait.sig01.gwa$sig.df$SNP.name %>% unique() %>% length())/SNP.N
total.trait.assoc.prop001 <- (covartrait.sig01.gwa$sig.df %>%
filter(qval <= .001) %>%
select(SNP.name) %>% unlist() %>%
unique() %>% length())/SNP.N
In general, there is a relatively high percentage of associations with our traits when using a cutoff of .05, especially when compared with insects. What could be causing that? Is it possible that the random survey event effect is not appropriate for trait data? Using a cutoff of .01 gives more conservative estimates.
With the .01 cutoff, and no trait covariates, 0.62% of all SNPs are associated with at least one of these traits and at a .001 cutoff, 0.26% of SNPs are trait-associated.
Sex will remain highly associated with our SNPs because it is completely confounded with Genet. An individual component of genet (a SNP) can explain the sex of an individual as well as the complete genet information. Sex consistently had 46% of the SNPs come up as significant (random chance). For these reasons, Sex-associated SNPs will not be considered.
It should be noted that this method of GWA is meant to be sensitive, in order to detect associations with extended phenotypes (insects) and so, it makes sense that associations with directly related traits are more numerous.
It is important to remember that these are corralative associations NOT causative ones. The SNPs that are associated with Sex are NOT those that determine sex, they are those whose genotype differs by sex, on average.
Additionally, the traits are relatively highly correlated:
# numeric trait data
trait.data <- fread("data/phenos-and-covars.csv",
select = c("Block","Dist.Edge",
# diversity.traits,
gwa.traits,
"richness","shannon.index","evenness"
),
stringsAsFactors = TRUE)
trait.data.num <- trait.data %>%
na.omit() %>% mutate_at(c("Sex.TEMP"), as.numeric)
# correlation plot function
corr.tile.plot <- function(data = trait.data.num, corr.meth = "pearson",
high.col = "red",low.col = "blue",text.size = 10){
cor(data,method = corr.meth) %>%
melt() %>%
ggplot(aes(x = Var1, y = Var2,
fill = ifelse(value == 1, NA, value))) + #don't show diag in col.
geom_tile(col = "black",size = .25) +
geom_text(aes(label = round(value,2)),size = text.size*.225) +
scale_fill_gradient2(low = low.col, high = high.col, mid = "white") +
labs(fill = "pearson's r",y=NULL,x=NULL,title = "Trait Correlation Matrix") +
theme_bw(base_size = text.size) +
theme(axis.text.x = element_text(angle = 35,hjust=1),
panel.border = element_blank(),
panel.grid = element_blank(), axis.text = element_text(face = "bold"))
}
corr.tile.plot(trait.data.num)
I’ll now build models to correct for these correlations (code hidden). I’ll include traits as covariates in a model if there is a correlation with an absolute value > .15. Again, Block
, Age
, Dist.Edge
, and (1|Survey.Year/Survey.Month/Genet.SSRrev)
were used in models for each trait (including) diversity indices.
We can see that the traits from SLA
to Vol
are fairly intercorrelated. This means that, since the trait GWA models don’t include other traits as covariates, a SNP may be detected that is actually associated with a correlated trait, and not the actual trait of interest. This will especially be true for Sex.TEMP
because, even though it is not directly correlated with our traits, it is very highly correlated with genet. Therefore, strong genet associations will be correlated with sex associations.
Additionally, I decided to look at different random effects specifications for the trait responses (no covariates):
trait.comp.2 <- rbind(
# cbind(trait.sig05.gwa$sig.tab, "label" = ".05"),
# cbind(trait.sig01.gwa$sig.tab, "label" = ".01"),
cbind(indv.trait.gwa$sig.tab, "label" = ".05,ind-RE"),
cbind(CompleteCross.GWA$sig.tab, "label" = ".05,CompCross"),
cbind(CrossNestIndv.GWA$sig.tab, "label" = ".05,CrossNest"),
cbind(IndvCrossed.GWA$sig.tab, "label" = ".05,IndvCrossed"),
cbind(YearNest.GWA$sig.tab, "label" = ".05,YearNest"))
p %+% (data = trait.comp.2) %+% aes(fill = label) %+%
labs(title = "Triat GWA Comparison", fill = "trait model") +
theme(axis.text.x = element_text(angle = 45))
#sigtabs for different random effects structures
rbind(CrossNestIndv.GWA$sig.tab,
CompleteCross.GWA$sig.tab,
IndvCrossed.GWA$sig.tab,
YearNest.GWA$sig.tab)
It appears that the CrossNest
random effect version ((1|year/month)+(1|month/genet)+(1|ID)
) yields the the more favorable results, in terms of # of associations. However, this requires a relaxed assumption of correlation (among observations) when compared to the true design (i.e. CompCross
: (1|year)+(1|month)+(1|genet)+(1|ID)
). This may not be appropriate or justified.
The CompCross
model is the most justified for identifying associations for traits only (without considering insects) but we can see that this method does not identify many SNPs for traits. In fact only BBreakDegDay
and FlPrev
had any associations identified by this method. In fact, the IndvCrossed
((1|year/month)+(1|genet)+(1|ID)
) specification yields identical results. These associations are the ones we would use as candidate SNPs for our traits if we were strictly interested in trait GWA.
Here we will look at the Diversity GWA
p %+% (data = diversity.sig05.gwa$sig.tab) +
labs(title = "Diversity GWA Results") %+%
theme(axis.text.x = element_text(angle = 45))
diversity.sig05.gwa$sig.tab %>% select(trait,n)
No Associations were detected for any MDS axis and only 2 were found forshannon index, however species richness and evenness had a few hits.
shared.SNP.plot(GWA.res.list = diversity.sig05.gwa, sig.lvl = .05,
prop = TRUE,
GWA.list2 = trait.sig05.gwa, sig.lvl2 = .01)
We can see that the richness and evenness associations are largely correlated with budbreak and EFN associations as well. Additionally, one of the 2 shannon traits index-associated was associated with BA and EFNs and all of them were associated with volume.
shared.SNP.plot(GWA.res.list = diversity.sig05.gwa, sig.lvl = .05,
prop = TRUE,
GWA.list2 = insect.sig.gwa, sig.lvl2 = .01)
A few of the diversity-associated SNPs are also common insect-associated SNPs. Unsurprisingly, Green Aphid associations are correlated with evenness associations. This is likely due to the fact that aphid populations are often in the thousands when present. Communities that contain this many aphids, and relatively few individuals from other species, will be more uneven than those with small numbers of aphids.
Now we’ll look at the number of Insect associated SNPs are also associated with traits. For now, we’ll use only one insect Phyllocolpa
for visualization and the TraitGWA
results (random effects specification: (1|year/month/genet)
) because the model is directly comparable to the insect models.
phyll <- sig.GWA(files.list = "InsectGWA_Phyllocolpa_master_gwa-results.csv",
sig.lvl = .05)$sig.df
phyll.covar <- sig.GWA(files.list = "InsTraitGWA_Phyllocolpa_master_gwa-results.csv",
sig.lvl = .05)$sig.df
# BA.001q <- sig.GWA(files.list = "TraitGWA_BAsqrt_master_gwa-results.csv",
# sig.lvl = .001)$sig.df
phyll.traits <- c("BA.2012sqrt","SLA","CT","Npct","CN","BAsqrt","Vol","BBreakDegDay")
trait.mods <- lapply(phyll.traits, FUN = function(x){
df <- sig.GWA(files.list = paste("TraitGWA",x,"master_gwa-results.csv", sep = "_"),
sig.lvl = .01)$sig.df
return(df)
})
trait.mods.df <- trait.mods %>% bind_rows()
trait.SNPs <- unique(trait.mods.df$SNP.name)
# Alternatively
# new.sig.df <- trait.sig05.gwa$full.results.df %>% filter(q.all <= .05)
# trait.SNPS <- unique(new.sig.df$SNP.name)
# Or
trait.sig05.bonf.gwa <- sig.GWA(trait.files,sig.lvl = .05,method = "bonf")
new.sig.df <- trait.sig05.bonf.gwa$full.results.df %>% filter(bonf.p <= .05)
# trait.SNPS <- new.sig.df %>% filter(bonf.p <= .05) %>% pull(SNP.name)
trait.SNPS <- new.sig.df %>% filter(bonf.all <= .05) %>% pull(SNP.name) #or
# intersection table
A <- phyll$SNP.name
B <- phyll.covar$SNP.name
C <- trait.SNPs
rbind(lenth = c(A = length(A),B = length(B),C = length(C)))
## A B C
## lenth 3062 1891 48823
# X and Y
AB <- intersect(A,B)
AC <- intersect(A,C) #
BC <- intersect(B,C) #
ABC <- Reduce(intersect, list(A,B,C))
# X but not Y
AnB <- A[! A %in% B] #
AnC <- A[! A %in% C]
BnC <- B[! B %in% C]
# calculate the proportion of SNPs mediated by traits
all.SNPs <- unique(c(A,B,AB,BC))
trait.mediated.SNPS <- unique(c(AnB,AC,BC))
prop.mediated <- length(trait.mediated.SNPS)/length(all.SNPs)
library(VennDiagram)
# Draw a venn diagram to show this
grid.newpage()
draw.triple.venn(area1 = length(A), area2 = length(B), area3 = length(C),
n12 = length(AB), n13 = length(AC), n23 = length(BC),
n123 = length(ABC),
category = c("Phyllocolpa GWA","Phyllocolpa GWA\n(trait covars)","TraitGWA"),
scaled = TRUE, euler.d = TRUE,
# fill = c("green","blue","red"),
cat.col = "black", #c("green","blue","red")
# lty = "blank", main = "GWA Intersection Plot"
)
## (polygon[GRID.polygon.939], polygon[GRID.polygon.940], polygon[GRID.polygon.941], polygon[GRID.polygon.942], polygon[GRID.polygon.943], polygon[GRID.polygon.944], text[GRID.text.945], text[GRID.text.946], text[GRID.text.947], text[GRID.text.948], text[GRID.text.949], text[GRID.text.950], text[GRID.text.951], text[GRID.text.952], text[GRID.text.953], text[GRID.text.954])
round(prop.mediated*100, 2)
## [1] 69.12
We can see that 69.12% of the Associations found for Phyllocolpa
are either associated with or directly dependent upon a phyllocolpa-associated trait.