Intro
Setup
- Insect Surveys
Identify Common Insects
Community Diversity Metrics
- Insect Grouping
- Create Full Data set
Save Objects

Intro

This document is simply to setup the R objects in the form of *.RData files. This includes the compilation of all data into 1 large data set

In the .html versions of this document, you can click on the boxes marked “code” to display the code that created each non-text section.

Setup

This section contains code for the initial setup such as loading packages and importing the data. Therefore, this section can be skipped in most cases.

Note that all code is executed from the project root directory.

# load packages
require(dplyr, quietly = TRUE) # for %>% 
require(lubridate, quietly = TRUE) # for date manipulation
library(doBy)

Insect Surveys

Insect Data

# read in the insect survey data: this data set has all individuals, including
  ## dead trees. The data areh also displayed as raw counts, i.e. not cpue.
  ## This is based off of the conversations with the statistical consulting
  ## advisor in 2018. - The data are most indicative of a zero-inflated poisson
  ## distribution.
insect.surveys <- read.csv(file = "data/insect-surveys.csv",header = TRUE,
                           strip.white = TRUE, na.strings = c("","NA"))

# set columns to appropriate type
insect.surveys <- within(insect.surveys, {
  # Change to from numeric to factor
  Block = factor(Block) # block identifier
  Position = factor(Position) # column in plot
  SerialNo = factor(SerialNo) # unique tree identifier
  Survey.Year = factor(Survey.Year) # Year of ovservation
  # reorder levels appropriately
  survey.event = factor(survey.event, # survey event ID
                         levels = c("jun16","aug16","jun17","aug17"))
  Survey.Month = factor(Survey.Month, # calendar month of observation
                         levels = c("Jun","Jul","Aug"))
  # add correct date format
  Date = as.Date(Date)
  # change from factor to character
  Notes = as.character(Notes)
  # remove empty columns - initially unkown galls, later removed
  Pelican.Beak.Gall = NULL
  Open.Brown.Leaf.Gall = NULL
})

# remove trees that were not part of the surveys (border trees)
insect.surveys <- insect.surveys %>% filter(!is.na(Min.per.Tree))

insect meta-data

# read in column meta data (already as factors)
metadata.insects <- read.csv("data/insect-col-metadata.csv", na.strings = c("","NA","<NA>"),
                             stringsAsFactors = TRUE,strip.white = TRUE)

Weather Data

# 2016
weather.2016 <- read.csv(file = "data/weather/2016_weather.csv", 
                         na.strings = "", strip.white = TRUE)
# 2017
weather.2017 <- read.csv(file = "data/weather/2017_weather.csv", 
                         na.strings = "", strip.white = TRUE)
# check that the columns are the same
stopifnot(all(names(weather.2016) == names(weather.2017)))
# combine them into 1 data set
weather.data <- rbind(weather.2016,weather.2017)

weather.data <- within(weather.data,{
  # change the "T" in precip. to NA
  avg.precip_in = ifelse(weather.data$avg.precip_in == "T",
                         yes =  NA,
                         no = as.numeric(as.character(
                           weather.data$avg.precip_in)))
  # formate the date column
  date = as.Date(dmy(date))
  # break the 'events' column into 3 flag columns: fog, rain, thunder
  fog.event = as.factor(ifelse(is.na(events),yes = NA,
               no = ifelse(grepl(events,pattern = "Fog"),yes = 1,no = 0)
               ))
  rain.event = as.factor(ifelse(is.na(events),yes = NA,
               no = ifelse(grepl(events,pattern = "Rain"),yes = 1,no = 0)
               ))
  thunderstorm.event = as.factor(ifelse(is.na(events),yes = NA,
               no = ifelse(grepl(events,pattern = "Thunderstorm"),
                           yes = 1,no = 0)
               ))
  events = NULL
})
# remove single year data objects
# rm(list = c("weather.2016","weather.2017"))

Phenotype Data

# # Read in the phenotype data table. This file is created and maintained by
#   ## Chris Cole. This file should be kept updated in the directory. 
# trait.data <- read.csv(file = "data/trait-data.csv",strip.white = TRUE,
#                        na.strings = c("","NA"))

# # select only relevent columns
#   ## important. Then comment columns will be removed
# keep.cols <- grep(names(trait.data), value = TRUE,
#      pattern = "SerialNo|.*1(6|7|8)|PlantingDate|Sex.Genet|
#      ^ExclusionFlag$|Latitude|Longitude|Dist.Edge|Ploidy",
#      perl = TRUE) %>% 
#   ## remove comment columns
#   grep(pattern = "Comment",invert = TRUE, value = TRUE)
# # reassign the data
# trait.data <- trait.data %>% select(keep.cols)
# 
# # fix column types
# trait.data <- within(trait.data,{
#   SerialNo = factor(SerialNo, levels = levels(insect.surveys$SerialNo))
#   Ploidy = factor(Ploidy)
#   BBreakDegDay.2016 = as.numeric(as.character(BBreakDegDay.2016))
# })
# 
# # remove 'keep.cols' object
# # rm(list = "keep.cols")

Long Phenotype Data

trait.data <- read.csv(file = "data/trait-data-long.csv", header = TRUE, strip.white = TRUE, na.strings = c("","NA"))
trait.data <- within(trait.data,{
  SerialNo <- factor(SerialNo)
  Block <- factor(Block)
  year.proj <- factor(year.proj)
})
trait.data <- trait.data %>% filter(year %in% c(2016,2017), month != "avg") %>% select(-c(Tree.ID, Block, Row, Position))

Identify Common Insects

total.spp <- nrow(metadata.insects %>% 
                      filter(Classification == "insect species", 
                             R.Columns %in% names(insect.surveys)))

In all 4 surveys, we identified 151 insect species in total. Some of those species were only found in some surveys and others were grouped according to family in the analyses.

# add up all insect columns 
all.insect.cols <- (metadata.insects %>% 
                  # This filter selects all insect columns, including
                    ## grouped insects (leafhoppers, ants, sawflies,...),
                    ## and insect sign (leaf rolls, ...)
                  filter(Col.Type == "Insect", 
                         R.Columns != "Unidentified", #remove un ID-d insects
                         R.Columns != "Silken.X.Cotton.Wood.Leaf.Miner",
                         R.Columns != "White.Scale",
                         #remove all ant and leafhopper columns
                         Family !=  "Formicidae", 
                         Family !=  "Cicadellidae",
                         Family !=  "Membracidae",
                         Family !=  "Delphacidae") %>% 
                  select(R.Columns))[,1] #`[,1]` removes rownames
# remove insects that aren't in the column names because common is used
all.insect.cols <- c(as.vector(all.insect.cols[which(all.insect.cols %in% names(insect.surveys))]),
                     "Ants", "Leafhoppers") #add back in grouped ants

trees.w.sp <- colSums(insect.surveys[,names(insect.surveys) %in% 
                                       all.insect.cols]>0,na.rm = TRUE)
# what proportion of trees does each insect occur on?
sp.prop <- trees.w.sp/nrow(insect.surveys)

# threshold for 'rarity'
thr <- 0.05 

# define rare and common insects
rare.insects <- names(sp.prop[sp.prop<thr])
common.insects <- names(sp.prop[sp.prop>=thr])

Of the total 104 insect columns considered, 18 occurred in at least 5% of observations (common insects shown below). These will be the insects considered for the majority of analyses.

print(sort(common.insects))

##  [1] "Ants"                  "Aspen.Leaf.Beetle"    
##  [3] "Blackmine"             "Blotch.Mine"          
##  [5] "Casebearer.Moth"       "Cotton.Scale"         
##  [7] "Cottonwood.Leaf.Mine"  "Green.Aphids"         
##  [9] "Green.Sawfly"          "Harmandia"            
## [11] "Leaf.Edge.Mine"        "Leafhoppers"          
## [13] "Lombardy.Mine"         "Pale.Green.Notodontid"
## [15] "Petiole.Gall"          "Phyllocolpa"          
## [17] "Smokey.Aphids"         "Weevil.Mine"

Note that a threshold of 1% would include many more insects than the 5% cut-off that is more standard (18 species in this data set). Some notable insects that would be included are: European snout beetles, Japanese beetles, and a Manduca spp.

It is also important to note that the Leafhoppers and Ants columns are groups that contain multiple species from the same Family. These groupings were made because the species members from these groups were not easily determined in the field (but their Family was). For Leafhoppers, individual species IDs were determined and given their own column, however, the individual species data was inconsistent accross surveys:

# leafhopper columns
leafhoppers <- (metadata.insects %>% filter(
                         Family %in% c("Cicadellidae",
                         "Membracidae",
                         "Delphacidae")) %>% 
                select(R.Columns))[,1]
# add up all ID-d leafhoppers
LH.IDs <- rowSums(insect.surveys[,names(insect.surveys) %in% leafhoppers])
# Plot the discrepency
layout(matrix(1:2,nrow = 1))
boxplot(insect.surveys$Leafhoppers ~ insect.surveys$survey.event, 
     main = "Leafhopper\nField counts",ylab = "Leafhoppers", 
     xlab = "survey event", ylim = c(0,6))
boxplot(LH.IDs~insect.surveys$survey.event, 
     main = "Leafhoppers\nCollected/Lab IDs",ylab = NULL, 
     xlab = "survey event", ylim = c(0,6))

Although we observed many more leafhoppers in the field in 2017, fewer were sent to the lab for identification in Jun 2017. This is likely due to the difficulty capturing leafhoppers. They escape easily and often can be tallied but not collected. Therefore, only the grouped Leafhoppers column will be used. Similarly, time constraints prevented our crew from collecting ants in 2017 and simply identified them as Ants.

Community Diversity Metrics

In this section we will calculate some diversity indices for our insect survey data.

# Create a Diversity data frame with the identifer columns from insect surveys
  ## other this data frame will have new columns with diversity metrics entered
grouped.insects.data <- insect.surveys %>% select(Unique.ID)

Insect Grouping

We will group the insect columns according to some criteria. This will allow for simpler analysis - \(\approx 5\) response variables as opposed to \(104\). Our 3 grouping methods are all.insects which includes all columns of identified insects, func.grps which groups the insect columns according to functional group (Ant, Free.Feeding, Leaf.Modifying, Wood.Modifying), and feed.loc which groups insects according to if they feed on or within the tree (ecto, endo). For each group, there are 2 variable types _cnt which is the total number of insects on the tree and _sp.cnt which counts the number of species observed on the tree.

Note These groupings and summaries do not account for the survey time yet, they are absolute counts.

All Insects

all.insects <- all.insect.cols # copy and rename all.insect.cols

# Add summary of all insects to diversity table
## complete counts
grouped.insects.data[,"all.insects_cnt"] <- # This column is analagous to richness
  rowSums(insect.surveys[,names(insect.surveys) %in% all.insects],
          na.rm = TRUE)
## presence data
grouped.insects.data[,"all.insects_sp.cnt"] <-
rowSums(insect.surveys[,names(insect.surveys)%in%all.insects]>0,
        na.rm = TRUE)

Here are summary statistics of the insect counts and spp. counts for all insects:

grouped.insects.data %>% 
               select(all.insects_cnt, all.insects_sp.cnt) %>% 
               summary() %>% knitr::kable()

	all.insects_cnt	all.insects_sp.cnt
	Min. : 0.0	Min. : 0.000
	1st Qu.: 7.0	1st Qu.: 3.000
	Median : 19.0	Median : 5.000
	Mean : 102.6	Mean : 4.699
	3rd Qu.: 63.0	3rd Qu.: 6.000
	Max. :10522.0	Max. :14.000

Aphids skew the maximum total number of insects counted. Interestingly, no tree had all common insect species present during any survey event.

Functional Groups

There are various functional groups at WisAsp as well (Ant, Free.Feeding, Leaf.Modifying, Wood.Modifying) which group insects according to how they feed on the trees.

# add up all insects according to functional group
func.grps <- list() # create empty list to fill
for (FG in levels(metadata.insects$Functional.Group)) {
  # create sublist for each functional group and fill it with insects
    ## from that group 
  func.grps[[FG]] <- (metadata.insects %>% 
                        # filter according to functional group and col classif.
                        ## 'insect species' ensures only species, not groups 
                          ##like 'sawflies' or 'leafhoppers' are present:
                        filter(Functional.Group == FG,
                               Classification == "insect species") %>%
                        select(R.Columns))[,1] #`[,1]` removes rownames

  # Add summary of all FGs to diversity table    
  ## complete counts
  grouped.insects.data[,paste0(FG,"_cnt")] <-
    rowSums(insect.surveys[,names(insect.surveys)%in%func.grps[[FG]]],
            na.rm = TRUE)
  ## presence data
  grouped.insects.data[,paste0(FG,"_sp.cnt")] <-
  rowSums(insect.surveys[,names(insect.surveys)%in%func.grps[[FG]]]>0,
          na.rm = TRUE)
}
# rm(FG) # remove iterator
grouped.insects.data$Ant_sp.cnt <- NULL # remove ant species count column

Internal or External Feeders

Finally, the insects can be grouped according to their feeding location. Insects that feed within the wood or leaves are endophages and those that feed externally are ectophages.

# add up all insects according to feeding location
feed.loc <- list() # empty list
for (LOC in levels(metadata.insects$Feeding.Location)) {
  # same as w/`func.grps` except with feeding locations
  feed.loc[[LOC]] <- (metadata.insects %>% 
                        filter(Feeding.Location == LOC, # see prev. chunk
                               Classification == "insect species") %>%
                      select(R.Columns))[,1] #`[,1]` removes rownames
  
  # Add summary of all feeding locations to diversity table    
  ## complete counts
  grouped.insects.data[,paste0(LOC,"_cnt")] <-
    rowSums(insect.surveys[,names(insect.surveys)%in%feed.loc[[LOC]]],
            na.rm = TRUE)
  ## presence data
  grouped.insects.data[,paste0(LOC,"_sp.cnt")] <-
  rowSums(insect.surveys[,names(insect.surveys)%in%feed.loc[[LOC]]]>0,
          na.rm = TRUE)
}
# rm(LOC) # remove iterator

Create Full Data set

Here We’ll combine all the data we’ve built so far into 1 data frame

# Add in Other Data Types
## merge in grouped insect data
full.data <- merge(insect.surveys,
                   grouped.insects.data,by = c("Unique.ID"), 
                   all.x = TRUE)
### check that the merge worked
stopifnot(nrow(full.data) == nrow(insect.surveys))

## make full.data and trait.data year and month columns equal
full.data$Survey.Month <- factor(ifelse(full.data$survey.event %in% c("aug16","aug17"), yes = "Aug", no = as.character(full.data$Survey.Month)))

## next merge in the trait data
full.data <- merge(full.data,
                   trait.data,
                   by.x = c("SerialNo","Survey.Year","Survey.Month"),
                   by.y = c("SerialNo","year.proj","month"),
                   all.x = TRUE)
### check that the merge worked
stopifnot(nrow(full.data) == nrow(insect.surveys))
## next merge in weather data
### first check that the dates in the insect survey frame all occur
  ## in the weather data
stopifnot(all(insect.surveys$Date %in% weather.data$date))
### then merge
full.data <- merge(full.data, weather.data,by.x = "Date", by.y = "date",all.x = TRUE)
### final check
stopifnot(nrow(full.data) == nrow(insect.surveys))

# # Fix Data that did not merge correctly
#   ## nested ifelse() statements to assign trait values according to survey.event.
# ## PGs
# full.data$PGsum <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$PGsum.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$PGsum.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$PGsum.A2017,
#       no = full.data$PGsum.J2017.JenaEQ)))
# ## CTs
# full.data$CTsum <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$CT.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$CT.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$CT.A2017,
#       no = full.data$CT.J2017)))
# ## Volume
# full.data$Volume <- ifelse(
#   test = full.data$Survey.Year == "2016",
#   yes = full.data$Vol.2016,
#   no = full.data$Vol.2017)
# 
# ## BA
# full.data$BA.all <- ifelse(
#   test = full.data$Survey.Year == "2016",
#   yes = full.data$BA.2016,
#   no = full.data$BA.2017)
# 
# # Still Needed
# ## CN
# ### C
# full.data$Cpct.all <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$Cpct.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$Cpct.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$Cpct.A2017,
#       no = full.data$Cpct.J2017)))
# ### N
# full.data$Npct.all <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$Npct.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$Npct.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$Npct.A2017,
#       no = full.data$Npct.J2017)))
# ## BudBreak
# full.data$BBreakDegDay.all <- ifelse(
#   test = full.data$Survey.Year == "2016",
#   yes = full.data$BBreakDegDay.2016,
#   no = full.data$BBreakDegDay.2017)
# ## Leaf Area
# ### ALA
# full.data$ALA.all <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$ALA.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$ALA.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$ALA.A2017,
#       no = full.data$ALA.J2017)))
# ### SLA
# full.data$SLA.all <- ifelse(
#   test = full.data$survey.event == "aug16",
#   yes = full.data$SLA.A2016,
#   no = ifelse(
#     test = full.data$survey.event == "jun16",
#     yes = full.data$SLA.J2016,
#     no = ifelse(
#       test = full.data$survey.event == "aug17",
#       yes = full.data$SLA.A2017,
#       no = full.data$SLA.J2017)))
# 
# ## EFNs
# full.data$EFNMean.all <- ifelse(
#   test = full.data$Survey.Year == "2016",
#   yes = full.data$EFNMean.2016,
#   no = full.data$EFNMean.2017)
# 
# # check changes 
# full.data %>% select(SLA.all,ALA.all,BBreakDegDay.all,
#                      Npct.all,Cpct.all,Volume,PGsum,CTsum,
#                      survey.event) %>% group_by(survey.event) %>%  summary()

# Remove trees that were surveyed for 0 minites (or NA)
filtered.data <- full.data %>% filter(Min.per.Tree > 0)
## Which observations were removed?
removed.rows <- full.data %>% 
  filter(!Unique.ID %in% filtered.data$Unique.ID) %>%
  select(Unique.ID,SerialNo,Min.per.Tree)
## which trees were removed and how many?
removed.trees <- unique(removed.rows$SerialNo)

Save Objects

# rm(list = c("i","new","old")) # remove un-needed objects

save(list = c("insect.surveys",
              "metadata.insects",
              "trait.data",
              "weather.data",
              "grouped.insects.data",
              "full.data",
              "common.insects",
              "rare.insects",
              "all.insects",
              "filtered.data"),
     file = "data/r-objects.RData",
     compress = FALSE)

WisAsp Insect Community Analysis Object Prep