In this document, I will test the effectiveness of different ordination techniques on my data set: t-SNE, a relatively new ordination technique, and NMDS.
# load data
data("r-objects")
# set random seed (for reproducibility)
set.seed(params$seed)
# get list of genets to use: either every genet, or a random sample of
# params$tsne.genet.n genets
if (params$tsne.genet.n == "all") {
  # factor levels are already unique, so no unique() is needed
  genet.list <- levels(full.data$Genet)
} else {
  genet.list <- sample(levels(full.data$Genet), size = params$tsne.genet.n)
}
# filter and subset the data
# all.insects is an external character vector of column names, so it is
# wrapped in all_of() to make the selection unambiguous
data <- full.data %>%
  dplyr::select(
    Unique.ID,
    Genet,
    Min.per.Tree,
    dplyr::all_of(all.insects),
    survey.event,
    Block, Row, Position
  ) %>%
  dplyr::filter(
    Genet %in% genet.list,
    Min.per.Tree > 0
  ) %>%
  na.omit()
# one more filter to remove rows with no (common) insects
## no insects will all have dissimilarities of 0
## drop = FALSE keeps a data frame even if there is only one common insect;
## NAs were already removed by na.omit(), so the logical index is NA-free
has.insects <- rowSums(data[, common.insects, drop = FALSE]) > 0
data <- data[has.insects, ]
# make the density data (insect counts per minute of survey effort)
# [[ ]] always returns a plain vector, for data.frames and tibbles alike
data.dens <- data[, all.insects] / data[["Min.per.Tree"]]
# set the colors for plotting: one rainbow color per genet present
colors <- rainbow(length(unique(data$Genet)))
names(colors) <- unique(data$Genet)
First, I’ll look at t-SNE (t-distributed stochastic neighbor embedding), a relatively new ordination method that is very good at clumping like things together and pulling different things apart.
First, we will look at all survey events and see how tsne separates our communities.
# build the tsne for our data subset, or reuse the cached ordinations when
# params$run.tsne is FALSE
if (params$run.tsne) {
  all.ins.tsne <- Rtsne(
    X = data.dens,
    dims = 2,
    num_threads = 4,
    check_duplicates = FALSE,
    normalize = TRUE,
    max_iter = 1000
  )
} else {
  load("data/ordination-comparison.RData")
}
set.seed(params$seed) # set seed
cols <- c("red", "orange", "yellow", "green", "purple", "brown", "blue")
# extract a subset of genets to see how they're grouped
# sample from the DISTINCT genets (sampling rows could pick one genet twice)
# and never ask for more genets than exist or than we have colors for
genets.present <- unique(as.character(data$Genet))
genet.subset <- sample(genets.present,
                       size = min(length(cols), length(genets.present)))
# per-row label: the genet name if it is highlighted, NA otherwise
# match() gives positions in genet.subset, so that is the vector to index
subset.dummy <- genet.subset[match(data$Genet, genet.subset)]
# plot the data points
plot(all.ins.tsne$Y, main = "t-SNE: all insects",
     xlab = "tsne1", ylab = "tsne2")
# overlay highlighted genets; rows with an NA label get an NA color and are
# not drawn (points() has no `add` argument, it always adds to the plot)
points(all.ins.tsne$Y, col = cols[factor(subset.dummy)], pch = 20)
# legend(x = "bottomright",
#        legend = levels(factor(subset.dummy)),
#        col = cols[seq_along(levels(factor(subset.dummy)))],
#        pch = 20,
#        bty = "o"
# )
The above plot shows us that, when all the data are considered, t-SNE shows good separation. However, the separation of communities does not seem to be strongly genet associated. A small sample of 7 genets is shown in color. We see more separation of individuals of a genet than we would expect if they were very similar in community composition.
Next let’s see how survey event separates out.
# Redraw the all-insect t-SNE embedding, this time coloring each point by
# the survey event it was recorded in.
plot(
  all.ins.tsne$Y,
  main = "t-SNE: all insects",
  xlab = "tsne1",
  ylab = "tsne2",
  pch = 20,
  col = data$survey.event
)
# Legend: one entry per survey event, colored by the same event values.
legend(
  x = "bottomright",
  bty = "n",
  pch = 20,
  col = unique(data$survey.event),
  legend = unique(data$survey.event)
)
We see from the above that survey event is really what is driving the differences in our communities. We even see that early season surveys have communities that are more than those occurring in late summer.
Now let’s look at just the common insects instead of all of them
# keep only the common-insect density columns
# common.insects is an external character vector, so wrap it in all_of()
# rather than passing it bare to select()
comm.dens <- data.dens %>% dplyr::select(dplyr::all_of(common.insects))
# fit the t-SNE on the common insects; when params$run.tsne is FALSE,
# comm.ins.tsne must already exist in the workspace
if (params$run.tsne) {
  comm.ins.tsne <- Rtsne(
    X = comm.dens,
    dims = 2,
    num_threads = 4,
    check_duplicates = FALSE,
    normalize = TRUE,
    max_iter = 1000
  )
}
# plot the data points
plot(comm.ins.tsne$Y,
     main = "t-SNE: common insects", xlab = "tsne1", ylab = "tsne2")
# overlay the highlighted genets (points() has no `add` argument; it always
# draws on the current plot)
points(comm.ins.tsne$Y, col = cols[factor(subset.dummy)], pch = 20)
# Common-insect t-SNE embedding with each point colored by survey event.
plot(
  comm.ins.tsne$Y,
  main = "t-SNE: common insects",
  xlab = "tsne1",
  ylab = "tsne2",
  pch = 20,
  col = data$survey.event
)
# Legend: one entry per survey event, colored by the same event values.
legend(
  x = "topright",
  bty = "n",
  pch = 20,
  col = unique(data$survey.event),
  legend = unique(data$survey.event)
)
We can see that when only common insects are considered, the same patterns emerge as above (just rotated 90 degrees).
set.seed(params$seed)
# rows of `data` (and therefore of data.dens) from the aug17 survey event
is.aug17 <- data$survey.event == "aug17"
aug17.dens <- data.dens %>% filter(is.aug17)
# fit the t-SNE on aug17 only; when params$run.tsne is FALSE,
# aug17.ins.tsne must already exist in the workspace
if (params$run.tsne) {
  aug17.ins.tsne <- Rtsne(
    X = aug17.dens,
    dims = 2,
    num_threads = 4,
    check_duplicates = FALSE,
    normalize = TRUE,
    max_iter = 1000
  )
}
# plot the data points
# aug17.dens comes from data.dens (all insects), so the title says so
plot(aug17.ins.tsne$Y,
     main = "t-SNE: all insects (aug17)", xlab = "tsne1", ylab = "tsne2")
# subset.dummy covers every row of `data`; build the colors on the full
# vector (same genet-to-color mapping as the earlier plots) and then keep
# only the aug17 rows so colors line up with the plotted points
aug17.cols <- cols[factor(subset.dummy)][is.aug17]
points(aug17.ins.tsne$Y, col = aug17.cols, pch = 20)
Now it seems that our points may be more similar to other members of the same genet than we thought, though clusters are not well defined.
# pull the precomputed NMDS results and grouping out of mds.obs
mds <- mds.obs$MDS$comm.mds$all
grp.mds <- mds.obs$MDS$grp.mds$all
group <- mds.obs$data$groups
# first plot: the ordination itself
ordiplot(mds)
# second plot: empty ordination frame with one hull polygon per group
ordiplot(mds, type = "n")
ordihull(ord = mds,
         groups = group,
         draw = "polygon",
         col = "grey90",
         label = FALSE) # FALSE, not F: F is an ordinary, reassignable variable
# per-event NMDS results, keyed by the names of mds.obs.event$MDS$comm.mds
mds.event <- list()
grp.mds.event <- list()
# two panels side by side: ordination (left) and group hulls (right)
layout(matrix(1:2, nrow = 1))
for (mat in names(mds.obs.event$MDS$comm.mds)) {
  mds.event[[mat]] <- mds.obs.event$MDS$comm.mds[[mat]]
  grp.mds.event[[mat]] <- mds.obs.event$MDS$grp.mds[[mat]]
  group <- mds.obs.event$data$groups[[mat]]
  # left panel: the ordination for this entry
  ordiplot(mds.event[[mat]], main = paste("Tree communities:", mat))
  # right panel: empty frame with one hull polygon per group
  ordiplot(mds.event[[mat]], type = "n",
           main = paste("Genet communities:", mat))
  ordihull(ord = mds.event[[mat]],
           groups = group,
           draw = "polygon",
           col = "grey90",
           label = FALSE) # FALSE, not F: F is a reassignable variable
}
# Cache the t-SNE and NMDS objects so a later run can load() them instead of
# recomputing the ordinations.
save(
  all.ins.tsne, comm.ins.tsne, aug17.ins.tsne,
  mds, grp.mds,
  mds.event, grp.mds.event, mds.obs.event,
  file = "data/ordination-comparison.RData"
)