library(tidyverse)
library(lubridate)
library(car)
dir.create("outputs/106", recursive = TRUE, showWarnings = FALSE)
rosa_data <- read_rds("./data/106-TrinityRiverHatchery-RoSA-meta-spawndate.rds")
mean spawning dates of the all RoSA genotypes in Trinity River Hatchery
rosa_data %>%
group_by(rosa) %>%
mutate(spawn_date = yday(COLLECTION_DATE)) %>%
summarise(
mean_sp = round(mean(spawn_date), 2),
sd_spawn = round(sd(spawn_date), 2),
n_fish = n()
)
Start ANOVA analysis
rosa_stats <- rosa_data %>%
filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
mutate(
spawn_date = yday(COLLECTION_DATE),
rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"))
)
lm1 <- lm(spawn_date ~ rosa_f, rosa_stats)
anova(lm1)
summary(lm1)
##
## Call:
## lm(formula = spawn_date ~ rosa_f, data = rosa_stats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.561 -11.059 -1.375 12.625 35.439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 267.375 1.935 138.162 < 2e-16 ***
## rosa_fHHHHHHHH 9.684 3.149 3.076 0.00242 **
## rosa_fLLLLLLLL 44.186 2.426 18.214 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.48 on 185 degrees of freedom
## Multiple R-squared: 0.6708, Adjusted R-squared: 0.6672
## F-statistic: 188.4 on 2 and 185 DF, p-value: < 2.2e-16
check model assumptions
plot(lm1)
e1 <- resid(lm1)
hist(e1, main = "Histogram of residuals") # not pretty, but acceptable in my opinion.
What about differences among genotypes variance?
rosa_stats %>%
group_by(rosa_f) %>%
summarise(
spawn_mean = mean(spawn_date),
spawn_sd = sd(spawn_date),
cv_spawn = 100 * round(spawn_sd / spawn_mean, 4),
n_fish = n()
)
leveneTest(rosa_stats$spawn_date ~ rosa_stats$rosa_f) # violation of homoscedasticity
Simulations to determine how big an issue the homoscedasticity violation is on the ANOVA result. The standard deviation among EE and EL RoSA genotypes is very similar and probably isnt significantly different. Here I’ll investigate the influence of differing SD between EL and LL on the distribution of P values.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA
y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and SD of LLLLLLLL RoSA
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, = n", xlab = ("Observed p-value"), breaks = 100)
# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 277, sd = 17.3) # represents mean and SD of HHHHHHHH RoSA
y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and inflated SD of LLLLLLLL RoSA
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and inflated H variance, = n", xlab = ("Observed p-value"), breaks = 100)
With equivalent samples sizes among EL and LL the influence of differing SD among genotypes does not influence the distribution of P-values. All comparisons would be statistically significant regardless of the observed differences in SD among genotypes.
Now lets see what happens when differences in sample size and SD are considered at the same time.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA, but same n as LLLLLLLL
y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and SD of LLLLLLLL RoSA
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, = n", xlab = ("Observed p-value"), breaks = 100)
# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 30, mean = 277, sd = 9.6) # represents mean and SD of HHHHHHHH RoSA
y <- rnorm(n = 100, mean = 311, sd = 17.3) # represents mean and inflated SD of LLLLLLLL RoSA
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean, variance and unequal n", xlab = ("Observed p-value"), breaks = 100)
Even with the difference in SD and sample size it doesn’t appear that those differences have a strong effect on analysing the difference among RoSA EL and LL spawn timing.
Data from appendix G in Hearsey 2011
gonadosomatic index = drained gonad weight / (total mass - drained gonad weight)
gsi_dat <- read.csv("./data/106-hearsey_thesis_appendixG_GSI_spawn_data.csv", strip.white = TRUE, sep = ",")
gsi_dat <- gsi_dat %>%
rename(
id = 1,
Date = 2,
Run = 3,
Sex = 4,
Total_Mass_kg = 5,
FL_mm = 6,
Gonad_gm = 7,
Drain_gonad_gm = 8,
NWF = 9,
Fecundity = 10
)
gsi_dat <- gsi_dat %>%
mutate(
drained_gonad = Drain_gonad_gm / 1000,
gonadSI = drained_gonad / (Total_Mass_kg - drained_gonad),
sample_date = as.Date(Date, format = "%m/%d/%Y"),
spawn_year = year(sample_date)
)
Make a summary table for each year
gsi_summary_table <- gsi_dat %>%
rename(Sex = 4) %>%
group_by(spawn_year, Run, Sex) %>%
summarise(
GSI_spawn_mean = round(mean(gonadSI), 3),
GSI_spawn_SD = round(sd(gonadSI), 3),
GSI_spawn_min = round(min(gonadSI), 3),
GSI_spawn_max = round(max(gonadSI), 3),
GSI_spawn_n = round(n(), 3)
)
gsi_summary_table
make the supplemental figure.
fig_dat <- gsi_dat %>%
rename(ID = 1) %>%
dplyr::select(ID, sample_date, gonadSI, Run) %>%
mutate(
cday = yday(sample_date),
cyear = year(sample_date)
) %>%
filter(!is.na(gonadSI))
est_rosa <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
est_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")
## Warning: Missing column names filled in: 'X16' [16]
est_gsi <- read_rds("./outputs/101/RoSA-klamath-estuary-rubias-assignments.rds") %>%
mutate(Indiv = gsub("chinook", "CH", indiv))
est_gt_meta <- left_join(est_rosa, est_meta, by = c("NMFS_DNA_ID" = "NMFS ID"))
est_final <- est_gsi %>%
dplyr::select(Indiv, repunit) %>%
left_join(est_gt_meta, ., "Indiv") %>%
filter(repunit == "TrinityRiver") %>%
mutate(gonadSI = gsi) %>%
filter(sex == "f") %>%
dplyr::select(Indiv, julian, year, gonadSI, rosa) %>%
rename(ID = Indiv, cday = julian, cyear = year, Run = rosa)
fig2plot <- fig_dat %>%
dplyr::select(Run, cday, cyear, gonadSI) %>%
mutate(Run = paste0(str_trim(Run), "_TRH")) %>%
bind_rows(., est_final)
fig2plot %>%
filter(!str_detect(Run, "\\?")) %>%
mutate(Run = factor(Run, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL", "spring_TRH", "fall_TRH"))) %>%
ggplot(., aes(x = cday, y = gonadSI, colour = Run)) +
geom_point(alpha = 0.75) +
xlab(label = "Estuary sampling date") +
ylab(label = "Gonadosomatic Index") +
scale_fill_discrete(name = "", labels = c("EE", "EL", "LL", "spring_TRH", "fall_TRH")) +
scale_colour_manual(name = "", values = c("gold", "tan2", "blue", "gold4", "blue4"), labels = c("EE", "EL", "LL", "spring_TRH", "fall_TRH")) +
theme_bw() +
scale_x_continuous(limits = c(121, 335), breaks = c(121, 152, 182, 213, 244, 274, 305, 335), labels = c("1 May", "1 Jun", "1 Jul", "1 Aug", "1 Sep", "1 Oct", "1 Nov", "1 Dec")) +
scale_y_continuous(breaks = c(0, 0.05, 0.10, 0.15, 0.20, 0.25), labels = c(0, 0.05, 0.10, 0.15, 0.20, 0.25), limits = c(0, 0.275)) +
theme(
axis.text = element_text(size = 12, family = "serif"),
axis.title = element_text(size = 12, family = "serif"), axis.text.x = element_text(angle = 45, hjust = 1, family = "serif"),
strip.background = element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.title.x = element_blank(), strip.text.x = element_text(size = 14, family = "serif"), plot.margin = unit(c(0, .2, 0.5, 0.2), "lines")
) +
facet_grid(. ~ cyear)
ggsave(filename = "outputs/106/trh-gsi-plot.pdf", width = 6, height = 4)
rosa_data %>%
mutate(coll_day = yday(COLLECTION_DATE)) %>%
filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
ggplot(., aes(x = coll_day, fill = rosa)) +
geom_bar(stat = "count", width = 1, position = "dodge") +
xlab(label = "Spawning date") +
ylab(label = "Count") +
theme_bw() +
facet_grid(rosa ~ .) +
theme(
axis.text = element_text(size = 12, family = "serif"),
axis.title = element_text(size = 14, family = "serif"),
axis.text.x = element_text(angle = 45, hjust = 1, family = "serif"),
strip.background = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
strip.text.x = element_text(size = 14, family = "serif"),
strip.text.y = element_blank()
) +
scale_x_continuous(
limits = c(244, 365), breaks = c(245, 275, 306, 335, 365),
labels = c("1 Sep", "1 Oct", "1 Nov", "1 Dec", "1 Jan")
) +
scale_fill_manual(
values = c("gold", "tan2", "blue"),
breaks = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"),
labels = c("EE", "EL", "LL"),
name = "RoSA genotype"
) +
geom_rect(
inherit.aes = FALSE, aes(xmin = 285, xmax = 297, ymin = 0, ymax = Inf),
fill = "grey85"
) # draw spawning hiatus period
ggsave(filename = "outputs/106/trh-spawn-date-plot.pdf", width = 6, height = 4)
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 3.6.0)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## car * 3.0-7 2020-03-11 [1] CRAN (R 3.6.0)
## carData * 3.0-3 2019-11-16 [1] CRAN (R 3.6.0)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## curl 4.3 2019-12-02 [1] CRAN (R 3.6.0)
## data.table 1.12.8 2019-12-09 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## foreign 0.8-72 2019-08-02 [2] CRAN (R 3.6.2)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate * 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## openxlsx 4.1.4 2019-12-06 [1] CRAN (R 3.6.0)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rio 0.5.16 2018-11-26 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
## zip 2.0.4 2019-09-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 8.781956 secs