library(tidyverse)
library(lubridate)
library(car)
dir.create("outputs/106", recursive = TRUE, showWarnings = FALSE)

Do analysis

load RoSA data

rosa_data <- read_rds("./data/106-TrinityRiverHatchery-RoSA-meta-spawndate.rds")

mean spawning dates of the all RoSA genotypes in Trinity River Hatchery

rosa_data %>%
  group_by(rosa) %>%
  mutate(spawn_date = yday(COLLECTION_DATE)) %>%
  summarise(
    mean_sp = round(mean(spawn_date), 2),
    sd_spawn = round(sd(spawn_date), 2),
    n_fish = n()
  )

All fish were spawned in

Start ANOVA analysis

rosa_stats <- rosa_data %>%
  filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
  mutate(
    spawn_date = yday(COLLECTION_DATE),
    rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"))
  )
lm1 <- lm(spawn_date ~ rosa_f, rosa_stats)
anova(lm1)

summary(lm1)

## 
## Call:
## lm(formula = spawn_date ~ rosa_f, data = rosa_stats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.561 -11.059  -1.375  12.625  35.439 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     267.375      1.935 138.162  < 2e-16 ***
## rosa_fHHHHHHHH    9.684      3.149   3.076  0.00242 ** 
## rosa_fLLLLLLLL   44.186      2.426  18.214  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.48 on 185 degrees of freedom
## Multiple R-squared:  0.6708, Adjusted R-squared:  0.6672 
## F-statistic: 188.4 on 2 and 185 DF,  p-value: < 2.2e-16

check model assumptions

plot(lm1)

e1 <- resid(lm1)
hist(e1, main = "Histogram of residuals") # not pretty, but acceptable in my opinion.

What about differences among genotypes variance?

rosa_stats %>%
  group_by(rosa_f) %>%
  summarise(
    spawn_mean = mean(spawn_date),
    spawn_sd = sd(spawn_date),
    cv_spawn = 100 * round(spawn_sd / spawn_mean, 4),
    n_fish = n()
  )

leveneTest(rosa_stats$spawn_date ~ rosa_stats$rosa_f) # violation of homoscedasticity

Explore influence of homoscedasticity on ANOVA result

Simulations to determine how big an issue the homoscedasticity violation is on the ANOVA result. The standard deviation among EE and EL RoSA genotypes is very similar and probably isnt significantly different. Here I’ll investigate the influence of differing SD between EL and LL on the distribution of P values.

nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
  x <- rnorm(n = 100, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA
  y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and SD of LLLLLLLL RoSA
  z <- t.test(x, y, var.equal = T)
  h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, = n", xlab = ("Observed p-value"), breaks = 100)

# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
  x <- rnorm(n = 100, mean = 277, sd = 17.3) # represents mean and SD of HHHHHHHH RoSA
  y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and inflated SD of LLLLLLLL RoSA
  z <- t.test(x, y, var.equal = T)
  h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and inflated H variance, = n", xlab = ("Observed p-value"), breaks = 100)

With equivalent samples sizes among EL and LL the influence of differing SD among genotypes does not influence the distribution of P-values. All comparisons would be statistically significant regardless of the observed differences in SD among genotypes.

Now lets see what happens when differences in sample size and SD are considered at the same time.

nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
  x <- rnorm(n = 100, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA, but same n as LLLLLLLL
  y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and SD of LLLLLLLL RoSA
  z <- t.test(x, y, var.equal = T)
  h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, = n", xlab = ("Observed p-value"), breaks = 100)

# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
  x <- rnorm(n = 30, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA
  y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and inflated SD of LLLLLLLL RoSA
  z <- t.test(x, y, var.equal = T)
  h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean, variance and unequal n", xlab = ("Observed p-value"), breaks = 100)

Even with the difference in SD and sample size it doesn’t appear that those differences have a strong effect on analysing the difference among RoSA EL and LL spawn timing.

Calculating gonadosomatic index from Hearsey’s thesis data. Trinity River hatchery in 2009 and 2010.

Data from appendix G in Hearsey 2011

gonadosomatic index = drained gonad weight / (total mass - drained gonad weight)

gsi_dat <- read.csv("./data/106-hearsey_thesis_appendixG_GSI_spawn_data.csv", strip.white = TRUE, sep = ",")

gsi_dat <- gsi_dat %>%
  rename(
    id = 1,
    Date = 2,
    Run = 3,
    Sex = 4,
    Total_Mass_kg = 5,
    FL_mm = 6,
    Gonad_gm = 7,
    Drain_gonad_gm = 8,
    NWF = 9,
    Fecundity = 10
  )

gsi_dat <- gsi_dat %>%
  mutate(
    drained_gonad = Drain_gonad_gm / 1000,
    gonadSI = drained_gonad / (Total_Mass_kg - drained_gonad),
    sample_date = as.Date(Date, format = "%m/%d/%Y"),
    spawn_year = year(sample_date)
  )

Make a summary table for each year

gsi_summary_table <- gsi_dat %>%
  rename(Sex = 4) %>%
  group_by(spawn_year, Run, Sex) %>%
  summarise(
    GSI_spawn_mean = round(mean(gonadSI), 3),
    GSI_spawn_SD = round(sd(gonadSI), 3),
    GSI_spawn_min = round(min(gonadSI), 3),
    GSI_spawn_max = round(max(gonadSI), 3),
    GSI_spawn_n = round(n(), 3)
  )
gsi_summary_table

make the supplemental figure.

fig_dat <- gsi_dat %>%
  rename(ID = 1) %>%
  dplyr::select(ID, sample_date, gonadSI, Run) %>%
  mutate(
    cday = yday(sample_date),
    cyear = year(sample_date)
  ) %>%
  filter(!is.na(gonadSI))
est_rosa <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
est_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")

## Warning: Missing column names filled in: 'X16' [16]

est_gsi <- read_rds("./outputs/101/RoSA-klamath-estuary-rubias-assignments.rds") %>%
  mutate(Indiv = gsub("chinook", "CH", indiv))
est_gt_meta <- left_join(est_rosa, est_meta, by = c("NMFS_DNA_ID" = "NMFS ID"))
est_final <- est_gsi %>%
  dplyr::select(Indiv, repunit) %>%
  left_join(est_gt_meta, ., "Indiv") %>%
  filter(repunit == "TrinityRiver") %>%
  mutate(gonadSI = gsi) %>%
  filter(sex == "f") %>%
  dplyr::select(Indiv, julian, year, gonadSI, rosa) %>%
  rename(ID = Indiv, cday = julian, cyear = year, Run = rosa)
fig2plot <- fig_dat %>%
  dplyr::select(Run, cday, cyear, gonadSI) %>%
  mutate(Run = paste0(str_trim(Run), "_TRH")) %>%
  bind_rows(., est_final)
fig2plot %>%
  filter(!str_detect(Run, "\\?")) %>%
  mutate(Run = factor(Run, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL", "spring_TRH", "fall_TRH"))) %>%
  ggplot(., aes(x = cday, y = gonadSI, colour = Run)) +
  geom_point(alpha = 0.75) +
  xlab(label = "Estuary sampling date") +
  ylab(label = "Gonadosomatic Index") +
  scale_fill_discrete(name = "", labels = c("EE", "EL", "LL", "spring_TRH", "fall_TRH")) +
  scale_colour_manual(name = "", values = c("gold", "tan2", "blue", "gold4", "blue4"), labels = c("EE", "EL", "LL", "spring_TRH", "fall_TRH")) +
  theme_bw() +
  scale_x_continuous(limits = c(121, 335), breaks = c(121, 152, 182, 213, 244, 274, 305, 335), labels = c("1 May", "1 Jun", "1 Jul", "1 Aug", "1 Sep", "1 Oct", "1 Nov", "1 Dec")) +
  scale_y_continuous(breaks = c(0, 0.05, 0.10, 0.15, 0.20, 0.25), labels = c(0, 0.05, 0.10, 0.15, 0.20, 0.25), limits = c(0, 0.275)) +
  theme(
    axis.text = element_text(size = 12, family = "serif"),
    axis.title = element_text(size = 12, family = "serif"), axis.text.x = element_text(angle = 45, hjust = 1, family = "serif"),
    strip.background = element_blank(),
    panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    axis.title.x = element_blank(), strip.text.x = element_text(size = 14, family = "serif"), plot.margin = unit(c(0, .2, 0.5, 0.2), "lines")
  ) +
  facet_grid(. ~ cyear)

ggsave(filename = "outputs/106/trh-gsi-plot.pdf", width = 6, height = 4)

make figure of spawning date by genotype in Trinity River hatchery

rosa_data %>%
  mutate(coll_day = yday(COLLECTION_DATE)) %>%
  filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
  ggplot(., aes(x = coll_day, fill = rosa)) +
  geom_bar(stat = "count", width = 1, position = "dodge") +
  xlab(label = "Spawning date") +
  ylab(label = "Count") +
  theme_bw() +
  facet_grid(rosa ~ .) +
  theme(
    axis.text = element_text(size = 12, family = "serif"),
    axis.title = element_text(size = 14, family = "serif"),
    axis.text.x = element_text(angle = 45, hjust = 1, family = "serif"),
    strip.background = element_blank(),
    legend.position = "top",
    panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    strip.text.x = element_text(size = 14, family = "serif"),
    strip.text.y = element_blank()
  ) +
  scale_x_continuous(
    limits = c(244, 365), breaks = c(245, 275, 306, 335, 365),
    labels = c("1 Sep", "1 Oct", "1 Nov", "1 Dec", "1 Jan")
  ) +
  scale_fill_manual(
    values = c("gold", "tan2", "blue"),
    breaks = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"),
    labels = c("EE", "EL", "LL"),
    name = "RoSA genotype"
  ) +
  geom_rect(
    inherit.aes = FALSE, aes(xmin = 285, xmax = 297, ymin = 0, ymax = Inf),
    fill = "grey85"
  ) # draw spawning hiatus period

ggsave(filename = "outputs/106/trh-spawn-date-plot.pdf", width = 6, height = 4)

Session Info

sessioninfo::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Sierra 10.12.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Denver              
##  date     2020-05-14                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version date       lib source        
##  abind         1.4-5   2016-07-21 [1] CRAN (R 3.6.0)
##  assertthat    0.2.1   2019-03-21 [1] CRAN (R 3.6.0)
##  backports     1.1.6   2020-04-05 [1] CRAN (R 3.6.2)
##  broom         0.5.6   2020-04-20 [1] CRAN (R 3.6.2)
##  car         * 3.0-7   2020-03-11 [1] CRAN (R 3.6.0)
##  carData     * 3.0-3   2019-11-16 [1] CRAN (R 3.6.0)
##  cellranger    1.1.0   2016-07-27 [1] CRAN (R 3.6.0)
##  cli           2.0.2   2020-02-28 [1] CRAN (R 3.6.0)
##  colorspace    1.4-1   2019-03-18 [1] CRAN (R 3.6.0)
##  crayon        1.3.4   2017-09-16 [1] CRAN (R 3.6.0)
##  curl          4.3     2019-12-02 [1] CRAN (R 3.6.0)
##  data.table    1.12.8  2019-12-09 [1] CRAN (R 3.6.0)
##  DBI           1.1.0   2019-12-15 [1] CRAN (R 3.6.0)
##  dbplyr        1.4.3   2020-04-19 [1] CRAN (R 3.6.2)
##  digest        0.6.25  2020-02-23 [1] CRAN (R 3.6.0)
##  dplyr       * 0.8.5   2020-03-07 [1] CRAN (R 3.6.0)
##  ellipsis      0.3.0   2019-09-20 [1] CRAN (R 3.6.0)
##  evaluate      0.14    2019-05-28 [1] CRAN (R 3.6.0)
##  fansi         0.4.1   2020-01-08 [1] CRAN (R 3.6.0)
##  farver        2.0.3   2020-01-16 [1] CRAN (R 3.6.0)
##  forcats     * 0.5.0   2020-03-01 [1] CRAN (R 3.6.0)
##  foreign       0.8-72  2019-08-02 [2] CRAN (R 3.6.2)
##  fs            1.4.1   2020-04-04 [1] CRAN (R 3.6.2)
##  generics      0.0.2   2018-11-29 [1] CRAN (R 3.6.0)
##  ggplot2     * 3.3.0   2020-03-05 [1] CRAN (R 3.6.0)
##  glue          1.4.0   2020-04-03 [1] CRAN (R 3.6.2)
##  gtable        0.3.0   2019-03-25 [1] CRAN (R 3.6.0)
##  haven         2.2.0   2019-11-08 [1] CRAN (R 3.6.0)
##  hms           0.5.3   2020-01-08 [1] CRAN (R 3.6.0)
##  htmltools     0.4.0   2019-10-04 [1] CRAN (R 3.6.0)
##  httr          1.4.1   2019-08-05 [1] CRAN (R 3.6.0)
##  jsonlite      1.6.1   2020-02-02 [1] CRAN (R 3.6.0)
##  knitr         1.28    2020-02-06 [1] CRAN (R 3.6.0)
##  labeling      0.3     2014-08-23 [1] CRAN (R 3.6.0)
##  lattice       0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
##  lifecycle     0.2.0   2020-03-06 [1] CRAN (R 3.6.0)
##  lubridate   * 1.7.8   2020-04-06 [1] CRAN (R 3.6.2)
##  magrittr      1.5     2014-11-22 [1] CRAN (R 3.6.0)
##  modelr        0.1.6   2020-02-22 [1] CRAN (R 3.6.0)
##  munsell       0.5.0   2018-06-12 [1] CRAN (R 3.6.0)
##  nlme          3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
##  openxlsx      4.1.4   2019-12-06 [1] CRAN (R 3.6.0)
##  pillar        1.4.3   2019-12-20 [1] CRAN (R 3.6.0)
##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 3.6.0)
##  purrr       * 0.3.4   2020-04-17 [1] CRAN (R 3.6.2)
##  R6            2.4.1   2019-11-12 [1] CRAN (R 3.6.0)
##  Rcpp          1.0.4   2020-03-17 [1] CRAN (R 3.6.0)
##  readr       * 1.3.1   2018-12-21 [1] CRAN (R 3.6.0)
##  readxl        1.3.1   2019-03-13 [1] CRAN (R 3.6.0)
##  reprex        0.3.0   2019-05-16 [1] CRAN (R 3.6.0)
##  rio           0.5.16  2018-11-26 [1] CRAN (R 3.6.0)
##  rlang         0.4.5   2020-03-01 [1] CRAN (R 3.6.0)
##  rmarkdown     2.1     2020-01-20 [1] CRAN (R 3.6.0)
##  rstudioapi    0.11    2020-02-07 [1] CRAN (R 3.6.0)
##  rvest         0.3.5   2019-11-08 [1] CRAN (R 3.6.0)
##  scales        1.1.0   2019-11-18 [1] CRAN (R 3.6.0)
##  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 3.6.0)
##  stringi       1.4.6   2020-02-17 [1] CRAN (R 3.6.0)
##  stringr     * 1.4.0   2019-02-10 [1] CRAN (R 3.6.0)
##  tibble      * 3.0.1   2020-04-20 [1] CRAN (R 3.6.2)
##  tidyr       * 1.0.2   2020-01-24 [1] CRAN (R 3.6.0)
##  tidyselect    1.0.0   2020-01-27 [1] CRAN (R 3.6.0)
##  tidyverse   * 1.3.0   2019-11-21 [1] CRAN (R 3.6.0)
##  vctrs         0.2.4   2020-03-10 [1] CRAN (R 3.6.0)
##  withr         2.2.0   2020-04-20 [1] CRAN (R 3.6.2)
##  xfun          0.13    2020-04-13 [1] CRAN (R 3.6.2)
##  xml2          1.3.2   2020-04-23 [1] CRAN (R 3.6.2)
##  yaml          2.2.1   2020-02-01 [1] CRAN (R 3.6.0)
##  zip           2.0.4   2019-09-01 [1] CRAN (R 3.6.0)
## 
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library

Running Time

Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:

Sys.time() - start_time

## Time difference of 8.781956 secs

106-TrinityRiver-ANOVA-etc

Neil Thompson