Merge pull request #149 from terraref/gh121_synthesis_update

kimberlyh66 · web-flow · commit c8d9daed7040 · 2019-06-26T12:44:07.000-07:00
[WIP] begin updating synthesis vignette
diff --git a/vignettes/04-synthesis-data.Rmd b/vignettes/04-synthesis-data.Rmd
@@ -9,34 +9,34 @@ The second analysis compares greenness from image data with canopy cover.
 ## Get and join data
 
 Here we combine two dataframes. 
-The first contains all the canopy height values for 2017, which was created in the traits vignette. 
-The second is the cumulative growing degree days for all of 2017, which were calculated from the daily minimum and maximum temperatures in the weather vignette. 
+The first contains all the canopy cover values for 2018, which was created in the traits vignette. 
+The second is the cumulative growing degree days for all of 2018, which were calculated from the daily minimum and maximum temperatures in the weather vignette. 
 They are combined by their common column, the date. 
 
-```{r synth_setup}
+```{r synth_setup, message=FALSE}
 library(dplyr)
 library(ggplot2)
 library(jsonlite)
 library(lubridate)
 library(traits)
-library(inflection)
 library(sf)
 library(stringr)
 options(betydb_url = "https://terraref.ncsa.illinois.edu/bety/",
-        betydb_api_version = 'v1')
+        betydb_api_version = 'beta', 
+        betydb_key = '9999999999999999999999999999999999999999')
 ```
 
 ```{r get_trait_data, message = FALSE}
-trait_canopy_height <- betydb_query(table     = "search", 
-                      trait     = "canopy_height", 
-                      date      = "~2017",
+trait_canopy_cover <- betydb_query(table     = "search", 
+                      trait     = "canopy_cover", 
+                      date      = "~2018",
                       limit     =  "none")
-trait_canopy_height_day = trait_canopy_height %>% 
+trait_canopy_cover_day = trait_canopy_cover %>% 
   mutate(day = as.Date(raw_date))
 ```
 
 ```{r get_weather_data}
-weather <- fromJSON('https://terraref.ncsa.illinois.edu/clowder/api/geostreams/datapoints?stream_id=46431&since=2017-01-01&until=2017-12-31', flatten = FALSE)
+weather <- fromJSON('https://terraref.ncsa.illinois.edu/clowder/api/geostreams/datapoints?stream_id=46431&since=2018-01-01&until=2018-12-31', flatten = FALSE)
 weather <- weather$properties %>% 
   mutate(time = ymd_hms(weather$end_time))
 daily_values = weather %>% 
@@ -52,7 +52,7 @@ daily_values <- daily_values %>%
 ```
 
 ```{r combine_trait_weather}
-trait_weather_df <- full_join(trait_canopy_height_day, daily_values, by = "day") %>% 
+trait_weather_df <- full_join(trait_canopy_cover_day, daily_values, by = "day") %>% 
   select(day, cultivar, mean, gdd_cum) %>% 
   na.omit()
 ```
@@ -61,22 +61,60 @@ trait_weather_df <- full_join(trait_canopy_height_day, daily_values, by = "day")
 
 We are interested in how growing degree days affects canopy cover. 
 To investigate this, we are going to model and plot their relationship. 
+We are using a logistic growth model here because it is appropriate for the shape of the GDD-cover relationship.
+
+The logistic growth model is specified as 
+
+$$y = \frac{c}{1+e^{a + b * \textrm{x}}}$$
+
+where $y$ is the response variable (canopy cover), $x$ is the predictor (cumulative growing degree days), $c$ is the asymptote, or maximum canopy cover, $a$ sets the initial canopy cover (at $x = 0$, $y = \frac{c}{1 + e^{a}}$), and $b$ controls the steepness of the curve. (reference)
+
 We want to know the relationship for each cultivar, so we'll start of by determining the parameters of the model for one of the cultivars in our dataset. 
-We are using a logistic growth model here because it is appropriate for the shape of the GDD-cover relationship. 
+We provide starting estimates for the asymptote $c$ and the parameter $a$, and use one measurement of the chosen cultivar (its canopy cover $y$ and the corresponding cumulative growing degree days $x$) to derive a starting estimate for $b$. 
+
+The code below fits the model to obtain better estimates of the $c$, $a$, and $b$ parameters, which are then used to plot the fitted model as an orange line on top of the black points, which are the observed values. 
 
 ```{r model_get_parameters}
 single_cultivar <- trait_weather_df %>% 
   filter(cultivar == "PI656026")
-cap     <- 150
-initial <- 25
-mean    <- single_cultivar$mean[15]
-gdd_cum <- single_cultivar$gdd_cum[15]
-rate    <- ((log((cap/mean) - 1)) - initial)/gdd_cum
-model_single_cultivar <- nls(mean ~ cap / (1 + exp(initial + rate * gdd_cum)), 
-                             start = list(cap = cap, initial = initial, rate = rate),
-                             data = single_cultivar, trace = TRUE)
+
+c <- 90
+a <- 0.1
+y <- single_cultivar$mean[3]
+g <- single_cultivar$gdd_cum[3]
+b <- ((log((c/y) - 1)) - a)/g
+model_single_cultivar <- nls(mean ~ c / (1 + exp(a + b * gdd_cum)), 
+                             start = list(c = c, a = a, b = b),
+                             data = single_cultivar)
+summary(model_single_cultivar)
+coef(model_single_cultivar)
+
+single_c <- coef(model_single_cultivar)[1]
+single_a <- coef(model_single_cultivar)[2]
+single_b <- coef(model_single_cultivar)[3]
+
 single_cultivar <- single_cultivar %>% 
-  mutate(mean_predict = coef(model_single_cultivar)[1] / (1 + exp(coef(model_single_cultivar)[2] + coef(model_single_cultivar)[3] * gdd_cum)))
+  mutate(mean_predict = single_c / (1 + exp(single_a + single_b * gdd_cum)))
+ggplot(single_cultivar) + # observed values (points) with the fitted logistic curve (line)
+  geom_point(aes(x = gdd_cum, y = mean)) +
+  geom_line(aes(x = gdd_cum, y = mean_predict), color = "orange") +
+  labs(x = "Cumulative growing degree days", y = "Canopy Cover") # `mean` holds the canopy_cover trait
+```
+
+We then calculate the inflection point for this cultivar's model. 
+
+The maximum growth rate is the largest change in canopy cover per unit of cumulative growing degree days. The cumulative growing degree day value at which this maximum growth rate is reached is called the _inflection point_. This occurs near the midpoint of the y-axis, or $\frac{c - a}{2}$.
+
+```{r}
+inf_y <- (as.numeric(single_c) - as.numeric(single_a)) / 2 # NOTE(review): analytically the inflection of c / (1 + exp(a + b * x)) is at y = c / 2, x = -a / b; confirm (c - a) / 2 is intended
+inf_x <- ((log((as.numeric(single_c) / inf_y) - 1)) - as.numeric(single_a)) / as.numeric(single_b) # model solved for x at y = inf_y
+
+ggplot(single_cultivar) + # fitted curve with the inflection point marked
+  geom_point(aes(x = gdd_cum, y = mean)) +
+  geom_line(aes(x = gdd_cum, y = mean_predict), color = "orange") +
+  geom_hline(yintercept = inf_y, linetype = "dashed") +
+  geom_vline(xintercept = inf_x) +
+  labs(x = "Cumulative growing degree days", y = "Canopy Cover") # `mean` holds the canopy_cover trait
 ```
 
 We then use the parameters from a single cultivar to run a model for each of the rest of the cultivars. 
@@ -85,24 +123,31 @@ We also calculated the inflection point from each cultivar's model, which will b
 
 ```{r model_all_cultivars}
 all_cultivars <- c(day = as.double(), cultivar = as.character(), mean = as.numeric(), 
-                   gdd_cum = as.numeric(), mean_predict = as.numeric())
+                   gdd_cum = as.numeric(), mean_predict = as.numeric(), 
+                   inf_y = as.numeric(), inf_x = as.numeric())
+
 for(each_cultivar in unique(trait_weather_df$cultivar)){
   each_cultivar_df <- filter(trait_weather_df, cultivar == each_cultivar)
-  each_cultivar_model <- nls(mean ~ cap / (1 + exp(initial + rate * gdd_cum)), 
-                            start = list(cap = cap, initial = initial, rate = rate),
-                            data = each_cultivar_df)
-  model_cap     <- coef(each_cultivar_model)[1]
-  model_initial <- coef(each_cultivar_model)[2]
-  model_rate    <- coef(each_cultivar_model)[3]
+  each_cultivar_model <- nls(mean ~ c / (1 + exp(a + b * gdd_cum)), 
+                             start = list(c = c, a = a, b = b), # starting values set in the single-cultivar chunk
+                             data = each_cultivar_df)
+  model_c <- coef(each_cultivar_model)[1]
+  model_a <- coef(each_cultivar_model)[2]
+  model_b <- coef(each_cultivar_model)[3]
   each_cultivar_df <- each_cultivar_df %>% 
-      mutate(mean_predict = model_cap / (1 + exp(model_initial + model_rate * gdd_cum)), 
-             inf_point = ((log((model_cap / 100) - 1)) - model_initial) / model_rate)
+    mutate(mean_predict = model_c / (1 + exp(model_a + model_b * gdd_cum)), 
+           inf_y = (as.numeric(model_c) - as.numeric(model_a)) / 2, 
+           inf_x = ((log((as.numeric(model_c) / inf_y) - 1)) - # solve with this cultivar's own a and b
+                      as.numeric(model_a)) / as.numeric(model_b))
   all_cultivars <- rbind(each_cultivar_df, all_cultivars)
 }
+
 ggplot(all_cultivars) +
   geom_point(aes(x = gdd_cum, y = mean)) +
   geom_line(aes(x = gdd_cum, y = mean_predict), color = "orange") +
   facet_wrap(~cultivar, scales = "free_y") +
+  geom_hline(aes(yintercept = inf_y), linetype = "dashed") + # per-cultivar column, not the global scalar
+  geom_vline(aes(xintercept = inf_x)) + # per-cultivar column, not the global scalar
   labs(x = "Cumulative growing degree days", y = "Canopy Height")
 ```
 
@@ -112,15 +157,9 @@ The last thing that we are going to do is assess the difference in this relation
 We are going to use the inflection point from the logistic growth model, which indicates when canopy cover stops increasing as quickly with increasingly more warm days. 
 The resulting inflection points for each cultivar are plotted as a histogram. 
 
-```{r plot_inflections}
-ggplot(all_cultivars) +
-  geom_point(aes(x = gdd_cum, y = mean)) +
-  geom_line(aes(x = gdd_cum, y = mean_predict), color = "orange") +
-  geom_vline(aes(xintercept = inf_point)) +
-  facet_wrap(~cultivar, scales = "free_y") +
-  labs(x = "Cumulative growing degree days", y = "Canopy Height")
-ggplot(data.frame(inf_points = unique(all_cultivars$inf_point))) +
-  geom_histogram(aes(x = inf_points)) +
+```{r plot_inflections, warning=FALSE}
+ggplot(data.frame(inf_points = unique(all_cultivars$inf_x))) +
+  geom_histogram(aes(x = inf_points), bins = 300) +
   xlim(min(all_cultivars$gdd_cum), max(all_cultivars$gdd_cum)) +
   labs(x = "Inflection points", y = "Number")
 ```
@@ -129,15 +168,12 @@ ggplot(data.frame(inf_points = unique(all_cultivars$inf_point))) +
 
 In this examnple we will extract our plot data from a series of images taken in May of Season 6, measure its "greeness" annd plot that against the plant heights from above in this vignette.
 
-The chosen statistic here is the normalised green-red difference index, NGRDI=(R-G)/(R+G) (Rasmussen et al., 2016), which uses the red and green bands from the image raster.
+The chosen statistic here is the normalised green-red difference index, $\textrm{NGRDI}=\frac{R-G}{R+G}$ (Rasmussen et al., 2016), which uses the red and green bands from the image raster.
 
 Below we retrieve all the available plots for a particular date, then find and convert the plot boundary JSON into tuples.
 We will use these tuples to extract the data for our plot.
 
 ```{r get_plot_boundary}
-# Setting up our options
-options(betydb_url = "https://terraref.ncsa.illinois.edu/bety/",
-        betydb_api_version = 'v1')
 
 # Making the query for our site
 sites <- betydb_query(table     = "sites",  
@@ -154,43 +190,26 @@ site.clip <- as(site.poly,"Spatial")
 
 These are the names of the full field RGB data for the month of May.
 We will be extracting our plot data from these files.
-A compressed file containing these images can be found on [Google Drive](https://drive.google.com/file/d/1UuVHHcyf9sxjX9fEUpD4qa9LGlBR0XnK/view?usp=sharing).
-Be sure to extract the image files into a folder that's accessible to the code below.
+A compressed file containing these images can be found on [Clowder](https://terraref.ncsa.illinois.edu/clowder/files/5c8175874f0c78f6486d6870?dataset=5c81709a4f0c78f6486d686c&space=).
+The code below downloads the compressed (.zip) file, which takes a few minutes, and then unzips it so that the image files are accessible. 
 
 ```{r synth_filename_array}
-image_files <- 
-   c('fullfield_L1_ua-mac_2018-05-01_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-02_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-03_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-05_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-06_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-08_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-09_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-10_rgb_stereovis_ir_sensors_fullfield_sorghum6_sun_may2018_-_copy_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-12_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-13_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-14_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-15_rgb_stereovis_ir_sensors_fullfield_sorghum6_sun_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-17_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-18_rgb_stereovis_ir_sensors_fullfield_sorghum6_sun_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-20_rgb_stereovis_ir_sensors_plots_sorghum6_shade_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-21_rgb_stereovis_ir_sensors_fullfield_sorghum6_shade_may2018_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-22_rgb_stereovis_ir_sensors_plots_sorghum6_sun_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-23_rgb_stereovis_ir_sensors_plots_sorghum6_sun_thumb.tif',
-     'fullfield_L1_ua-mac_2018-05-28_rgb_stereovis_ir_sensors_plots_sorghum6_shade_rgb_eastedge_mn_thumb.tif'
-     )
-```
-
-```{r, echo = F}
-image_files_paths <- file.path("vignettes/", image_files)
+if(!file.exists("rgb_images.zip")){ # skip the slow download when the archive is already present
+  download.file("https://terraref.ncsa.illinois.edu/clowder/files/5c8175874f0c78f6486d6870/blob", destfile = "rgb_images.zip", mode = "wb") # binary mode so the zip is not corrupted on Windows
+  unzip("rgb_images.zip", exdir = ".")
+}
 ```
 
 We will loop through these images, extract our plot data, and calculate the "greeness" of each extract.
 We are using the name of the file to extract the date for later.
 
-```{r synth_get_greeness}
+```{r synth_get_greeness, message=FALSE}
 library(raster)
 
+# Get file paths for all image files (pattern is a regular expression, so anchor on the ".tif" extension)
+image_files <- list.files(".", pattern = "\\.tif$")
+image_files_paths <- file.path(".", image_files)
+
 # Extract the date from the file name
 getDate <- function(file_name){
   date <- str_match_all(file_name, '[0-9]{4}-[0-9]{2}-[0-9]{2}')[[1]][,1]
@@ -232,9 +251,9 @@ We then pull in the canopy data for our charting purposes.
 
 ```{r get_trait_data_2, message = FALSE}
 trait_canopy_cover <- betydb_query(table     = "search", 
-                       trait     = "canopy_cover", 
-                       date      = "~2018 May",
-                       limit     =  "none")
+                                   trait     = "canopy_cover", 
+                                   date      = "~2018 May",
+                                   limit     =  "none")
                        
 trait_canopy_cover_day <- trait_canopy_cover %>% 
   mutate(day = as.Date(raw_date))
@@ -245,15 +264,14 @@ We now need to add the height data to the data set to plot.
 We then determine the average canopy cover across the site for the day that the sensor data were collected. 
 The relationship between our greenness metric and average canopy cover are plotted. 
 
-```{r plot_sensor_trait}
+```{r plot_sensor_trait, warning=FALSE}
 trait_canopy_cover_daily <- trait_canopy_cover_day %>% 
   filter(day %in% greenness_df$day) %>% 
   group_by(day) %>% 
   summarise(mean_canopy_cover = mean(mean), 
             sd_canopy_cover = sd(mean))
 sensor_trait_df <- left_join(trait_canopy_cover_daily, greenness_df, by = "day")
+
 ggplot(sensor_trait_df, aes(x = mean_canopy_cover, y = greeness)) +
   geom_point()
 ```
-
-