This document is simply to setup the R objects in the form of *.RData
files. This includes the compilation of all data into 1 large data set
In the .html
versions of this document, you can click on the boxes marked “code” to display the code that created each non-text section.
This section contains code for the initial setup such as loading packages and importing the data. Therefore, this section can be skipped in most cases.
Note that all code is executed from the project root directory.
# load packages
require(dplyr, quietly = TRUE) # for %>%
require(lubridate, quietly = TRUE) # for date manipulation
library(doBy)
# read in the insect survey data: this data set has all individuals, including
## dead trees. The data areh also displayed as raw counts, i.e. not cpue.
## This is based off of the conversations with the statistical consulting
## advisor in 2018. - The data are most indicative of a zero-inflated poisson
## distribution.
insect.surveys <- read.csv(file = "data/insect-surveys.csv",header = TRUE,
strip.white = TRUE, na.strings = c("","NA"))
# set columns to appropriate type
insect.surveys <- within(insect.surveys, {
# Change to from numeric to factor
Block = factor(Block) # block identifier
Position = factor(Position) # column in plot
SerialNo = factor(SerialNo) # unique tree identifier
Survey.Year = factor(Survey.Year) # Year of ovservation
# reorder levels appropriately
survey.event = factor(survey.event, # survey event ID
levels = c("jun16","aug16","jun17","aug17"))
Survey.Month = factor(Survey.Month, # calendar month of observation
levels = c("Jun","Jul","Aug"))
# add correct date format
Date = as.Date(Date)
# change from factor to character
Notes = as.character(Notes)
# remove empty columns - initially unkown galls, later removed
Pelican.Beak.Gall = NULL
Open.Brown.Leaf.Gall = NULL
})
# remove trees that were not part of the surveys (border trees)
insect.surveys <- insect.surveys %>% filter(!is.na(Min.per.Tree))
# read in column meta data (already as factors)
metadata.insects <- read.csv("data/insect-col-metadata.csv", na.strings = c("","NA","<NA>"),
stringsAsFactors = TRUE,strip.white = TRUE)
# 2016
weather.2016 <- read.csv(file = "data/weather/2016_weather.csv",
na.strings = "", strip.white = TRUE)
# 2017
weather.2017 <- read.csv(file = "data/weather/2017_weather.csv",
na.strings = "", strip.white = TRUE)
# check that the columns are the same
stopifnot(all(names(weather.2016) == names(weather.2017)))
# combine them into 1 data set
weather.data <- rbind(weather.2016,weather.2017)
weather.data <- within(weather.data,{
# change the "T" in precip. to NA
avg.precip_in = ifelse(weather.data$avg.precip_in == "T",
yes = NA,
no = as.numeric(as.character(
weather.data$avg.precip_in)))
# formate the date column
date = as.Date(dmy(date))
# break the 'events' column into 3 flag columns: fog, rain, thunder
fog.event = as.factor(ifelse(is.na(events),yes = NA,
no = ifelse(grepl(events,pattern = "Fog"),yes = 1,no = 0)
))
rain.event = as.factor(ifelse(is.na(events),yes = NA,
no = ifelse(grepl(events,pattern = "Rain"),yes = 1,no = 0)
))
thunderstorm.event = as.factor(ifelse(is.na(events),yes = NA,
no = ifelse(grepl(events,pattern = "Thunderstorm"),
yes = 1,no = 0)
))
events = NULL
})
# remove single year data objects
# rm(list = c("weather.2016","weather.2017"))
# # Read in the phenotype data table. This file is created and maintained by
# ## Chris Cole. This file should be kept updated in the directory.
# trait.data <- read.csv(file = "data/trait-data.csv",strip.white = TRUE,
# na.strings = c("","NA"))
# # select only relevent columns
# ## important. Then comment columns will be removed
# keep.cols <- grep(names(trait.data), value = TRUE,
# pattern = "SerialNo|.*1(6|7|8)|PlantingDate|Sex.Genet|
# ^ExclusionFlag$|Latitude|Longitude|Dist.Edge|Ploidy",
# perl = TRUE) %>%
# ## remove comment columns
# grep(pattern = "Comment",invert = TRUE, value = TRUE)
# # reassign the data
# trait.data <- trait.data %>% select(keep.cols)
#
# # fix column types
# trait.data <- within(trait.data,{
# SerialNo = factor(SerialNo, levels = levels(insect.surveys$SerialNo))
# Ploidy = factor(Ploidy)
# BBreakDegDay.2016 = as.numeric(as.character(BBreakDegDay.2016))
# })
#
# # remove 'keep.cols' object
# # rm(list = "keep.cols")
trait.data <- read.csv(file = "data/trait-data-long.csv", header = TRUE, strip.white = TRUE, na.strings = c("","NA"))
trait.data <- within(trait.data,{
SerialNo <- factor(SerialNo)
Block <- factor(Block)
year.proj <- factor(year.proj)
})
trait.data <- trait.data %>% filter(year %in% c(2016,2017), month != "avg") %>% select(-c(Tree.ID, Block, Row, Position))
total.spp <- nrow(metadata.insects %>%
filter(Classification == "insect species",
R.Columns %in% names(insect.surveys)))
In all 4 surveys, we identified 151 insect species in total. Some of those species were only found in some surveys and others were grouped according to family in the analyses.
# add up all insect columns
all.insect.cols <- (metadata.insects %>%
# This filter selects all insect columns, including
## grouped insects (leafhoppers, ants, sawflies,...),
## and insect sign (leaf rolls, ...)
filter(Col.Type == "Insect",
R.Columns != "Unidentified", #remove un ID-d insects
R.Columns != "Silken.X.Cotton.Wood.Leaf.Miner",
R.Columns != "White.Scale",
#remove all ant and leafhopper columns
Family != "Formicidae",
Family != "Cicadellidae",
Family != "Membracidae",
Family != "Delphacidae") %>%
select(R.Columns))[,1] #`[,1]` removes rownames
# remove insects that aren't in the column names because common is used
all.insect.cols <- c(as.vector(all.insect.cols[which(all.insect.cols %in% names(insect.surveys))]),
"Ants", "Leafhoppers") #add back in grouped ants
trees.w.sp <- colSums(insect.surveys[,names(insect.surveys) %in%
all.insect.cols]>0,na.rm = TRUE)
# what proportion of trees does each insect occur on?
sp.prop <- trees.w.sp/nrow(insect.surveys)
# threshold for 'rarity'
thr <- 0.05
# define rare and common insects
rare.insects <- names(sp.prop[sp.prop<thr])
common.insects <- names(sp.prop[sp.prop>=thr])
Of the total 104 insect columns considered, 18 occurred in at least 5% of observations (common insects shown below). These will be the insects considered for the majority of analyses.
print(sort(common.insects))
## [1] "Ants" "Aspen.Leaf.Beetle"
## [3] "Blackmine" "Blotch.Mine"
## [5] "Casebearer.Moth" "Cotton.Scale"
## [7] "Cottonwood.Leaf.Mine" "Green.Aphids"
## [9] "Green.Sawfly" "Harmandia"
## [11] "Leaf.Edge.Mine" "Leafhoppers"
## [13] "Lombardy.Mine" "Pale.Green.Notodontid"
## [15] "Petiole.Gall" "Phyllocolpa"
## [17] "Smokey.Aphids" "Weevil.Mine"
Note that a threshold of 1% would include many more insects than the 5% cut-off that is more standard (18 species in this data set). Some notable insects that would be included are: European snout beetles, Japanese beetles, and a Manduca spp.
It is also important to note that the Leafhoppers
and Ants
columns are groups that contain multiple species from the same Family. These groupings were made because the species members from these groups were not easily determined in the field (but their Family was). For Leafhoppers
, individual species IDs were determined and given their own column, however, the individual species data was inconsistent accross surveys:
# leafhopper columns
leafhoppers <- (metadata.insects %>% filter(
Family %in% c("Cicadellidae",
"Membracidae",
"Delphacidae")) %>%
select(R.Columns))[,1]
# add up all ID-d leafhoppers
LH.IDs <- rowSums(insect.surveys[,names(insect.surveys) %in% leafhoppers])
# Plot the discrepency
layout(matrix(1:2,nrow = 1))
boxplot(insect.surveys$Leafhoppers ~ insect.surveys$survey.event,
main = "Leafhopper\nField counts",ylab = "Leafhoppers",
xlab = "survey event", ylim = c(0,6))
boxplot(LH.IDs~insect.surveys$survey.event,
main = "Leafhoppers\nCollected/Lab IDs",ylab = NULL,
xlab = "survey event", ylim = c(0,6))
Although we observed many more leafhoppers in the field in 2017, fewer were sent to the lab for identification in Jun 2017. This is likely due to the difficulty capturing leafhoppers. They escape easily and often can be tallied but not collected. Therefore, only the grouped Leafhoppers
column will be used. Similarly, time constraints prevented our crew from collecting ants in 2017 and simply identified them as Ants
.
In this section we will calculate some diversity indices for our insect survey data.
# Create a Diversity data frame with the identifer columns from insect surveys
## other this data frame will have new columns with diversity metrics entered
grouped.insects.data <- insect.surveys %>% select(Unique.ID)
We will group the insect columns according to some criteria. This will allow for simpler analysis - \(\approx 5\) response variables as opposed to \(104\). Our 3 grouping methods are all.insects
which includes all columns of identified insects, func.grps
which groups the insect columns according to functional group (Ant, Free.Feeding, Leaf.Modifying, Wood.Modifying), and feed.loc
which groups insects according to if they feed on or within the tree (ecto, endo). For each group, there are 2 variable types _cnt
which is the total number of insects on the tree and _sp.cnt
which counts the number of species observed on the tree.
Note These groupings and summaries do not account for the survey time yet, they are absolute counts.
all.insects <- all.insect.cols # copy and rename all.insect.cols
# Add summary of all insects to diversity table
## complete counts
grouped.insects.data[,"all.insects_cnt"] <- # This column is analagous to richness
rowSums(insect.surveys[,names(insect.surveys) %in% all.insects],
na.rm = TRUE)
## presence data
grouped.insects.data[,"all.insects_sp.cnt"] <-
rowSums(insect.surveys[,names(insect.surveys)%in%all.insects]>0,
na.rm = TRUE)
Here are summary statistics of the insect counts and spp. counts for all insects:
grouped.insects.data %>%
select(all.insects_cnt, all.insects_sp.cnt) %>%
summary() %>% knitr::kable()
all.insects_cnt | all.insects_sp.cnt | |
---|---|---|
Min. : 0.0 | Min. : 0.000 | |
1st Qu.: 7.0 | 1st Qu.: 3.000 | |
Median : 19.0 | Median : 5.000 | |
Mean : 102.6 | Mean : 4.699 | |
3rd Qu.: 63.0 | 3rd Qu.: 6.000 | |
Max. :10522.0 | Max. :14.000 |
Aphids skew the maximum total number of insects counted. Interestingly, no tree had all common insect species present during any survey event.
There are various functional groups at WisAsp as well (Ant, Free.Feeding, Leaf.Modifying, Wood.Modifying) which group insects according to how they feed on the trees.
# add up all insects according to functional group
func.grps <- list() # create empty list to fill
for (FG in levels(metadata.insects$Functional.Group)) {
# create sublist for each functional group and fill it with insects
## from that group
func.grps[[FG]] <- (metadata.insects %>%
# filter according to functional group and col classif.
## 'insect species' ensures only species, not groups
##like 'sawflies' or 'leafhoppers' are present:
filter(Functional.Group == FG,
Classification == "insect species") %>%
select(R.Columns))[,1] #`[,1]` removes rownames
# Add summary of all FGs to diversity table
## complete counts
grouped.insects.data[,paste0(FG,"_cnt")] <-
rowSums(insect.surveys[,names(insect.surveys)%in%func.grps[[FG]]],
na.rm = TRUE)
## presence data
grouped.insects.data[,paste0(FG,"_sp.cnt")] <-
rowSums(insect.surveys[,names(insect.surveys)%in%func.grps[[FG]]]>0,
na.rm = TRUE)
}
# rm(FG) # remove iterator
grouped.insects.data$Ant_sp.cnt <- NULL # remove ant species count column
Finally, the insects can be grouped according to their feeding location. Insects that feed within the wood or leaves are endophages and those that feed externally are ectophages.
# add up all insects according to feeding location
feed.loc <- list() # empty list
for (LOC in levels(metadata.insects$Feeding.Location)) {
# same as w/`func.grps` except with feeding locations
feed.loc[[LOC]] <- (metadata.insects %>%
filter(Feeding.Location == LOC, # see prev. chunk
Classification == "insect species") %>%
select(R.Columns))[,1] #`[,1]` removes rownames
# Add summary of all feeding locations to diversity table
## complete counts
grouped.insects.data[,paste0(LOC,"_cnt")] <-
rowSums(insect.surveys[,names(insect.surveys)%in%feed.loc[[LOC]]],
na.rm = TRUE)
## presence data
grouped.insects.data[,paste0(LOC,"_sp.cnt")] <-
rowSums(insect.surveys[,names(insect.surveys)%in%feed.loc[[LOC]]]>0,
na.rm = TRUE)
}
# rm(LOC) # remove iterator
Here We’ll combine all the data we’ve built so far into 1 data frame
# Add in Other Data Types
## merge in grouped insect data
full.data <- merge(insect.surveys,
grouped.insects.data,by = c("Unique.ID"),
all.x = TRUE)
### check that the merge worked
stopifnot(nrow(full.data) == nrow(insect.surveys))
## make full.data and trait.data year and month columns equal
full.data$Survey.Month <- factor(ifelse(full.data$survey.event %in% c("aug16","aug17"), yes = "Aug", no = as.character(full.data$Survey.Month)))
## next merge in the trait data
full.data <- merge(full.data,
trait.data,
by.x = c("SerialNo","Survey.Year","Survey.Month"),
by.y = c("SerialNo","year.proj","month"),
all.x = TRUE)
### check that the merge worked
stopifnot(nrow(full.data) == nrow(insect.surveys))
## next merge in weather data
### first check that the dates in the insect survey frame all occur
## in the weather data
stopifnot(all(insect.surveys$Date %in% weather.data$date))
### then merge
full.data <- merge(full.data, weather.data,by.x = "Date", by.y = "date",all.x = TRUE)
### final check
stopifnot(nrow(full.data) == nrow(insect.surveys))
# # Fix Data that did not merge correctly
# ## nested ifelse() statements to assign trait values according to survey.event.
# ## PGs
# full.data$PGsum <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$PGsum.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$PGsum.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$PGsum.A2017,
# no = full.data$PGsum.J2017.JenaEQ)))
# ## CTs
# full.data$CTsum <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$CT.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$CT.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$CT.A2017,
# no = full.data$CT.J2017)))
# ## Volume
# full.data$Volume <- ifelse(
# test = full.data$Survey.Year == "2016",
# yes = full.data$Vol.2016,
# no = full.data$Vol.2017)
#
# ## BA
# full.data$BA.all <- ifelse(
# test = full.data$Survey.Year == "2016",
# yes = full.data$BA.2016,
# no = full.data$BA.2017)
#
# # Still Needed
# ## CN
# ### C
# full.data$Cpct.all <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$Cpct.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$Cpct.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$Cpct.A2017,
# no = full.data$Cpct.J2017)))
# ### N
# full.data$Npct.all <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$Npct.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$Npct.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$Npct.A2017,
# no = full.data$Npct.J2017)))
# ## BudBreak
# full.data$BBreakDegDay.all <- ifelse(
# test = full.data$Survey.Year == "2016",
# yes = full.data$BBreakDegDay.2016,
# no = full.data$BBreakDegDay.2017)
# ## Leaf Area
# ### ALA
# full.data$ALA.all <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$ALA.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$ALA.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$ALA.A2017,
# no = full.data$ALA.J2017)))
# ### SLA
# full.data$SLA.all <- ifelse(
# test = full.data$survey.event == "aug16",
# yes = full.data$SLA.A2016,
# no = ifelse(
# test = full.data$survey.event == "jun16",
# yes = full.data$SLA.J2016,
# no = ifelse(
# test = full.data$survey.event == "aug17",
# yes = full.data$SLA.A2017,
# no = full.data$SLA.J2017)))
#
# ## EFNs
# full.data$EFNMean.all <- ifelse(
# test = full.data$Survey.Year == "2016",
# yes = full.data$EFNMean.2016,
# no = full.data$EFNMean.2017)
#
# # check changes
# full.data %>% select(SLA.all,ALA.all,BBreakDegDay.all,
# Npct.all,Cpct.all,Volume,PGsum,CTsum,
# survey.event) %>% group_by(survey.event) %>% summary()
# Remove trees that were surveyed for 0 minites (or NA)
filtered.data <- full.data %>% filter(Min.per.Tree > 0)
## Which observations were removed?
removed.rows <- full.data %>%
filter(!Unique.ID %in% filtered.data$Unique.ID) %>%
select(Unique.ID,SerialNo,Min.per.Tree)
## which trees were removed and how many?
removed.trees <- unique(removed.rows$SerialNo)
# rm(list = c("i","new","old")) # remove un-needed objects
save(list = c("insect.surveys",
"metadata.insects",
"trait.data",
"weather.data",
"grouped.insects.data",
"full.data",
"common.insects",
"rare.insects",
"all.insects",
"filtered.data"),
file = "data/r-objects.RData",
compress = FALSE)