Multiple GWA Blup Test

This document will use selected trait data from 2016 and 2017 to investigate the best methods for Blup GWA as it relates to multivariate GWA.

# load data and libraries ----
data.full <- fread("data/phenos-and-covars.csv")

We will consider all of the following terms in the final GWA model:

# define terms to use ----
(block.vars <- c("Block","Dist.Edge","Survey.Year","Age"))
## [1] "Block"       "Dist.Edge"   "Survey.Year" "Age"
(trait.vars <- c("BAsqrt","BBreakDegDay","PG","CT","Npct","CN","SLA","ALA"))
## [1] "BAsqrt"       "BBreakDegDay" "PG"           "CT"          
## [5] "Npct"         "CN"           "SLA"          "ALA"
(genet <- c("Genet.SSRrev"))
## [1] "Genet.SSRrev"

and “PG” will be our response variable.

# reduce the data ----
data <- data.full %>%
  filter(Survey.Month == "Jun") %>% # only consider one time point per year
  select(block.vars,trait.vars,genet) %>% # only keep columns of interest
  mutate_at(c("Block","Genet.SSRrev","Survey.Year"), as.factor) %>% # convert fo factor
  mutate_at(trait.vars,scale) %>% # scale the trait data 
  na.omit()

Create fake SNPs

First, I’ve created imaginarys SNP whose genotype value is truly associated with PG and BAsqrt respectively. We will then regress this SNP on the BLUPs (with and without other covars) that we create for PG to see if we detect an association with fake.SNP.PG and/or falsely detect an association with fake.SNP.BA. BAsqrt was chosen because it is moderately correlated with PGs.

# create a fake SNP to test this on
## This SNP is truly associated with BAsqrt
genet.mean.ba <- data %>% group_by(Genet.SSRrev) %>% summarize(BAsqrt = mean(BAsqrt,na.rm=TRUE))
genet.mean.ba$fake.SNP <- ifelse(genet.mean.ba$BAsqrt <= quantile(genet.mean.ba$BAsqrt, 1/3),
                                 yes = 0, no = ifelse(genet.mean.ba$BAsqrt >= quantile(genet.mean.ba$BAsqrt, 2/3),
                                                      yes = 1, no = 2))

data$fake.SNP.BA <- genet.mean.ba$fake.SNP[match(data$Genet.SSRrev, genet.mean.ba$Genet.SSRrev)]

ggplot(data = data, aes(y = BAsqrt, x = fake.SNP.BA)) + 
  geom_point() +
  geom_smooth(method = "lm") + labs(x = "fake.SNP.BA Genotype", 
                                    title = "Association of fake.SNP.BA with BAsqrt")

ggplot(data = data, aes(y = PG, x = fake.SNP.BA)) + 
  geom_point() +
  geom_smooth(method = "lm") + labs(x = "fake.SNP.BA Genotype", 
                                    title = "Association of fake.SNP.BA with PG")

# create a fake SNP to test this on
## This SNP is truly associated with PG
genet.mean.PG <- data %>% group_by(Genet.SSRrev) %>% summarize(PG = mean(PG,na.rm=TRUE))
genet.mean.PG$fake.SNP <- ifelse(genet.mean.PG$PG <= quantile(genet.mean.PG$PG, 1/3),
                                 yes = 0, no = ifelse(genet.mean.PG$PG >= quantile(genet.mean.PG$PG, 2/3),
                                                      yes = 1, no = 2))

data$fake.SNP.PG <- genet.mean.PG$fake.SNP[match(data$Genet.SSRrev, genet.mean.PG$Genet.SSRrev)]

ggplot(data = data, aes(y = BAsqrt, x = fake.SNP.PG)) + 
  geom_point() +
  geom_smooth(method = "lm") + labs(x = "fake.SNP.PG Genotype", 
                                    title = "Association of fake.SNP.PG with BA")

ggplot(data = data, aes(y = PG, x = fake.SNP.PG)) + 
  geom_point() +
  geom_smooth(method = "lm") + labs(x = "fake.SNP.PG Genotype", 
                                    title = "Association of fake.SNP.PG with PG")

Fit Blup Models

Here we will fit 3 blup models. Each of them will contain our blocking variables Block, Dist.Edge, Ageand genet as a random effect but will differ in what traits they contain. The first blup model: fm1 contains no traits, fm2 contains only BAsqrt, and fm3 contains all traits.

Below are anova tables for each model:

# fit LMM without trait covars ----

fm1 <- lmer(data = data, formula = "PG ~ Block + Dist.Edge + Age + (1|Genet.SSRrev)")
fm2 <- lmer(data = data, "PG ~ Block + Dist.Edge + Age + BAsqrt + (1|Genet.SSRrev)")
fm3 <- lmer(data = data, "PG ~ Block + Dist.Edge + Age + BAsqrt + CN + SLA + ALA +
            CT + BBreakDegDay + Npct + (1|Genet.SSRrev)")

car::Anova(fm1)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##            Chisq Df Pr(>Chisq)  
## Block     7.4250  3    0.05952 .
## Dist.Edge 2.3495  1    0.12532  
## Age       4.0141  1    0.04512 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(fm2)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##             Chisq Df Pr(>Chisq)    
## Block      7.8270  3   0.049725 *  
## Dist.Edge  5.0988  1   0.023942 *  
## Age        7.7485  1   0.005376 ** 
## BAsqrt    36.9848  1  1.191e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(fm3)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##                 Chisq Df Pr(>Chisq)    
## Block         33.9567  3  2.023e-07 ***
## Dist.Edge      6.7882  1   0.009176 ** 
## Age            4.9483  1   0.026116 *  
## BAsqrt        49.3582  1  2.132e-12 ***
## CN            58.4305  1  2.106e-14 ***
## SLA           58.7589  1  1.782e-14 ***
## ALA           42.5028  1  7.058e-11 ***
## CT           317.2146  1  < 2.2e-16 ***
## BBreakDegDay  35.5490  1  2.487e-09 ***
## Npct          27.7410  1  1.387e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see from fm2 and fm3 that BAsqrt explains a significant portion of the variation in PG. This will be important later

Blup differences

How does including traits affect the genet-specific BLUPs generated by lmer()?

Here, I’ve extracted the BLUPs from our 3 models and plotted them together

blup.1 <- ranef(fm1)
blup.2 <- ranef(fm2)
blup.3 <- ranef(fm3)

blup.data <- data.table("genet" = rownames(blup.1$Genet.SSRrev),
                        "fm1" = blup.1$Genet.SSRrev[rownames(blup.1$Genet.SSRrev),], 
                        "fm2" = blup.2$Genet.SSRrev[rownames(blup.1$Genet.SSRrev),],
                        "fm3" = blup.3$Genet.SSRrev[rownames(blup.1$Genet.SSRrev),])
blup.melt <- melt(blup.data,id.vars = "genet")

g <- ggplot(data = blup.melt %>% #only look at 20 genets
              filter(genet %in% levels(data$Genet.SSRrev)[1:20]), 
            aes(y = value, x = genet, fill = variable)) +
  geom_bar(stat = "identity",position = "dodge")
plot(g)

We can see that adding trait covariates to the model from which BLUPs are extracted does indeed influence the actual value of our BLUP. This is because our BLUP is “overall genet-specific variation that influences PG variation”. That genet-specific explanatory power (genet random effect, genet residial) can be partitioned out (or accounted for) into speficic sub-components (genet-speficic BA variation is a subset of overall genet-specific variation etc.).

In each of these models the BLUP is the “genet-speficic effect on PG that is not explained with the traits”. We’ll get back to this point too.

Conduct GWA on our Blups

Here we will run a GWA analysis to test if our SNP is associated with our BLUP. To do so, I will add all terms to the GWA model that were not included in the BLUP model. For example: fm1 contained no traits so gwa.1 then contains all traits and fm3 contained all the traits so gwa.3 won’t contain any. In this way, we ensure that all traits are accounted for during one of the steps.

# fit gwa model ----
## first add blups into data
data$blup.1 <- blup.data$fm1[match(x = data$Genet.SSRrev, blup.data$genet)]
data$blup.2 <- blup.data$fm2[match(x = data$Genet.SSRrev, blup.data$genet)]
data$blup.3 <- blup.data$fm3[match(x = data$Genet.SSRrev, blup.data$genet)]


gwa.1 <- lm(data = data, "blup.1 ~ BAsqrt + CN + SLA + ALA +
            CT + BBreakDegDay + Npct + fake.SNP.PG + fake.SNP.BA")
gwa.2 <- lm(data = data, "blup.2 ~ CN + SLA + ALA +
            CT + BBreakDegDay + Npct + fake.SNP.PG + fake.SNP.BA")
gwa.3 <- lm(data = data, "blup.3 ~ fake.SNP.PG + fake.SNP.BA")

car::Anova(gwa.1)
## Anova Table (Type II tests)
## 
## Response: blup.1
##               Sum Sq   Df   F value    Pr(>F)    
## BAsqrt          0.09    1    0.2282  0.632873    
## CN            158.13    1  390.8964 < 2.2e-16 ***
## SLA            70.88    1  175.2162 < 2.2e-16 ***
## ALA             1.73    1    4.2683  0.038913 *  
## CT            576.94    1 1426.2282 < 2.2e-16 ***
## BBreakDegDay   46.75    1  115.5597 < 2.2e-16 ***
## Npct           91.79    1  226.9158 < 2.2e-16 ***
## fake.SNP.PG   108.86    1  269.1121 < 2.2e-16 ***
## fake.SNP.BA     2.81    1    6.9477  0.008435 ** 
## Residuals    1233.78 3050                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(gwa.2)
## Anova Table (Type II tests)
## 
## Response: blup.2
##               Sum Sq   Df   F value    Pr(>F)    
## CN            154.58    1  384.3264 < 2.2e-16 ***
## SLA            80.73    1  200.7094 < 2.2e-16 ***
## ALA             2.33    1    5.7859  0.016214 *  
## CT            579.13    1 1439.8750 < 2.2e-16 ***
## BBreakDegDay   42.02    1  104.4791 < 2.2e-16 ***
## Npct           87.43    1  217.3739 < 2.2e-16 ***
## fake.SNP.PG   107.86    1  268.1790 < 2.2e-16 ***
## fake.SNP.BA     2.92    1    7.2703  0.007049 ** 
## Residuals    1227.13 3051                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(gwa.3)
## Anova Table (Type II tests)
## 
## Response: blup.3
##              Sum Sq   Df  F value Pr(>F)    
## fake.SNP.PG  182.63    1 435.7383 <2e-16 ***
## fake.SNP.BA    0.97    1   2.3093 0.1287    
## Residuals   1281.28 3057                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see that all gwa models identified a significant association between our BLUP and fake.SNP.PG. However, both gwa.1 and gwa.2 identified a significant association with fake.SNP.BA as well.

Let’s visualize the BLUP ~ SNP relationship:

pg.snp.plot <- ggplot(data = data, aes(x = fake.SNP.PG)) + 
  geom_point(aes(y = blup.1), col = "red",) + 
  geom_smooth(aes(y = blup.1),method = "lm", col = "red") +
  
  geom_point(aes(y = blup.2, x = fake.SNP.PG + .025), col = "blue") +
  geom_smooth(aes(y = blup.2, x = fake.SNP.PG + .025),method = "lm", col = "blue",linetype = "dashed") +
 
  geom_point(aes(y = blup.3, x = fake.SNP.PG - .025), col = "green") +
  geom_smooth(aes(y = blup.3, x = fake.SNP.PG + .025),method = "lm", col = "green",linetype = "dashed") + labs(y = "blup value")

pg.snp.plot

All three BLUPs have a clear association with fake.SNP.PG

ba.snp.plot <- ggplot(data = data, aes(x = fake.SNP.BA)) + 
  geom_point(aes(y = blup.1), col = "red",) + 
  geom_smooth(aes(y = blup.1),method = "lm", col = "red") +
  
  geom_point(aes(y = blup.2, x = fake.SNP.BA + .025), col = "blue") +
  geom_smooth(aes(y = blup.2, x = fake.SNP.BA + .025),method = "lm", col = "blue",linetype = "dashed") +
 
  geom_point(aes(y = blup.3, x = fake.SNP.BA - .025), col = "green") +
  geom_smooth(aes(y = blup.3, x = fake.SNP.BA + .025),method = "lm", col = "green",linetype = "dashed")+ labs(y = "blup value")

ba.snp.plot

and they also appear not to have an assocaition with fake.SNP.BA either. This makes sense because the genet-specific variation contibuting to PGs should not be affected by the BA-associated SNP. However, when you account for all traits in the GWA model and not the BLUP model, you are asking “how is the SNP Genotype associated with average overall genet-specific effect on PGs, after accounting for individuals’ traits”. We are comparing a variation-less value (calculating the BLUP removes within-genet variation) to one that still contains variation:

small.dat <- data %>% filter(Genet.SSRrev %in% levels(data$Genet.SSRrev)[1:10])

ggplot(data = small.dat, aes(x = BAsqrt, y = blup.3, fill = Genet.SSRrev)) +
  geom_point(aes(col = Genet.SSRrev))

This means that, even though the blup.3 value for all of Genet 10 are the same, they are standardized by their highly variable BAsqrt in the gwa.3 multivariate gwa model.

I truly don’t understand the genetics enough to know if this is a valid method but it seems wrong to me on the surface. Perhaps if we performed GWA on averaged (i.e. only one piece of data for each Genet) trait covariates. I still think it needs to be tested though, since I don’t fully understand.

Mixed Model GWA:

Part of the reason the mixed-effects GWA models I’ve been working on are nice is that they account for everything in one step, on an individual basis:

new.form <- update(formula(fm3), . ~ . + fake.SNP.BA)

big.1 <- lmer(data = data, update(formula(fm1), . ~ . + fake.SNP.PG + fake.SNP.BA))
big.2 <- lmer(data = data, update(formula(fm2), . ~ . + fake.SNP.PG + fake.SNP.BA))
big.3 <- lmer(data = data, update(formula(fm3), . ~ . + fake.SNP.PG + fake.SNP.BA))

car::Anova(big.1)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##               Chisq Df Pr(>Chisq)    
## Block        7.1778  3    0.06644 .  
## Dist.Edge    2.4481  1    0.11767    
## Age          4.0254  1    0.04482 *  
## fake.SNP.PG 79.0917  1    < 2e-16 ***
## fake.SNP.BA  0.0003  1    0.98548    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(big.2)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##               Chisq Df Pr(>Chisq)    
## Block        7.5450  3   0.056414 .  
## Dist.Edge    5.1879  1   0.022745 *  
## Age          7.1354  1   0.007558 ** 
## BAsqrt      35.8133  1  2.172e-09 ***
## fake.SNP.PG 77.6984  1  < 2.2e-16 ***
## fake.SNP.BA  0.0324  1   0.857197    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(big.3)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: PG
##                 Chisq Df Pr(>Chisq)    
## Block         32.9117  3  3.362e-07 ***
## Dist.Edge      6.8424  1   0.008902 ** 
## Age            4.5585  1   0.032756 *  
## BAsqrt        48.0999  1  4.050e-12 ***
## CN            57.1356  1  4.068e-14 ***
## SLA           55.9614  1  7.391e-14 ***
## ALA           42.4409  1  7.285e-11 ***
## CT           309.9667  1  < 2.2e-16 ***
## BBreakDegDay  34.6998  1  3.847e-09 ***
## Npct          26.4987  1  2.637e-07 ***
## fake.SNP.PG   69.9729  1  < 2.2e-16 ***
## fake.SNP.BA    0.2370  1   0.626347    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All 3 of our models have the same conclusions when we use this method.