library(tidyverse)
library(car)
rosa_data <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
klaten_nomiss <- rosa_data %>%
distinct(Indiv, .keep_all = T) %>%
filter(!str_detect(rosa, "\\?"))
There are 505 at-entry estuary samples with 100% complete genotypes. There are 515 Klamath at-entry estuary samples.
estuary_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")
## Warning: Missing column names filled in: 'X16' [16]
rosa_meta <- left_join(estuary_meta, rosa_data, by = c("NMFS ID" = "NMFS_DNA_ID")) %>%
dplyr::select(-ID) %>%
rename(NMFS_DNA_ID = "NMFS ID") %>%
dplyr::select(NMFS_DNA_ID, rosa, everything())
Number of rosa genotypes in the data
rosa_meta %>%
filter(!str_detect(rosa, "\\?")) %>%
count(rosa)
Visualize the relationship between RoSA and sampling date
rosa_meta %>%
filter(!str_detect(rosa, "\\?")) %>%
mutate(rosa = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL", "HHHEEEEE", "LLLLHHLL"))) %>%
ggplot(., aes(x = julian, color = rosa, fill = rosa)) +
geom_histogram(binwidth = 1) +
xlab("julian day") +
theme_bw() +
facet_grid(year ~ .)
ANOVA analysis
rosa_stats <- rosa_meta %>%
filter(
!str_detect(rosa, "\\?"),
rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")
) %>% # remove samples with missing data and recombinant genotypes
mutate(
rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")),
year_f = factor(year, levels = c("2009", "2010"))
)
lm1 <- lm(julian ~ rosa_f * year_f, rosa_stats)
anova(lm1)
Genotype has a significant effect on sampling date, year has a statistically significant effect on sample date and there is a significant year by genotype effect.
Let’s look at the effect sizes
summary(lm1)
##
## Call:
## lm(formula = julian ~ rosa_f * year_f, data = rosa_stats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.029 -13.837 -1.029 14.113 48.163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.837 1.929 77.662 < 2e-16 ***
## rosa_fHHHHHHHH 42.330 5.680 7.453 4.10e-13 ***
## rosa_fLLLLLLLL 114.192 2.487 45.912 < 2e-16 ***
## year_f2010 7.525 2.643 2.847 0.00459 **
## rosa_fHHHHHHHH:year_f2010 -4.653 6.489 -0.717 0.47366
## rosa_fLLLLLLLL:year_f2010 -27.769 3.578 -7.760 4.87e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.51 on 496 degrees of freedom
## Multiple R-squared: 0.8722, Adjusted R-squared: 0.8709
## F-statistic: 677 on 5 and 496 DF, p-value: < 2.2e-16
In 2010 the LLLLLLLL genotype arrived approximately 28 days earlier than LLLLLLLL in 2009. There was also a year effect, where in 2010 on average all fish arrive 7.5 days earlier. That isn’t a big difference when we’re considering a entry season of 173 days.
Checking model assumptions
plot(lm1) # looks ok.
eij <- residuals(lm1)
hist(eij, main = "Histogram of residuals") # looks ok
checking homoscedasticity
rosa_stats %>%
group_by(rosa_f, year_f) %>%
summarise(
mean_jul = mean(julian),
variance = round(var(julian), 2),
stdev = sd(julian),
n_fish = n()
) %>%
arrange(year_f, rosa_f)
rosa_stats09 <- rosa_stats %>% filter(year_f == "2009")
leveneTest(rosa_stats09$julian ~ rosa_stats09$rosa_f) # violation of homoscedasticity
rosa_stats10 <- rosa_stats %>% filter(year_f == "2010")
leveneTest(rosa_stats10$julian ~ rosa_stats10$rosa_f) # no violation of homoscedasticity
Whats the % variance explained by the interaction?
anova1 <- anova(lm1)
sumSq_interaction <- anova1$`Sum Sq`[3]
sumSq_total <- sum(anova1$`Sum Sq`)
The interaction between year and genotype category accounts for 1.625837 of the total variance. This seems really small. Let’s figure out what the variance % explained by year and by genotype is to compare to this.
sumSq_genotype <- anova1$`Sum Sq`[1]
sumSq_year <- anova1$`Sum Sq`[2]
RoSA genotype explains 85.1900908 percent of the total variance
Year explains 0.4033037 percent of the total variance.
OK, so big picture here. Genotype has the strongest effect by a large margin. There is a statistically significant result of the interaction between year and genotype, but it explains very little of the total variance and does not change the overall pattern of EEEEEEEE enter first, followed by HHHHHHHH and LLLLLLLL genotypes. Also note the interaction was only significant for LLLLLLLL genotypes in 2010.
Simulations to determine how big an issue the homoscedasticity violation is on the ANOVA result. 2009 had larger differences in standard deviation and sample size among RoSA genotypes so I’ll start there. Using the Levene’s test there was no violation of homoscedasticity in 2010.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 150, sd = 16.6) # represents mean and SD of EEEEEEEE RoSA in 2009
y <- rnorm(n = 100, mean = 192, sd = 12.2) # represents mean and SD of HHHHHHHHH RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, = n", xlab = ("Observed p-value"), breaks = 100)
# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 150, sd = 16.6) # represents mean and SD of EEEEEEEE RoSA in 2009
y <- rnorm(n = 100, mean = 192, sd = 16.6) # represents mean and inflated SD of HHHHHHHHH RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and inflated H variance, = n", xlab = ("Observed p-value"), breaks = 100)
Ok, so equal sample size among EEEEEEEE and HHHHHHHH and changing the SD of each group doesn’t have any influence on the distribution of P-values.
Now I’ll consider changing the sample size AND variance to see how that influences the P-value distribution.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 150, sd = 16.6) # represents mean and SD of EEEEEEEE RoSA in 2009
y <- rnorm(n = 10, mean = 192, sd = 12.2) # represents mean and SD of HHHHHHHHH RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, diff n", xlab = ("Observed p-value"), breaks = 100)
# now lets say that the HHHHHHHH RoSA had equivalent SD to EEEEEEEE (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 150, sd = 16.6) # represents mean and SD of EEEEEEEE RoSA in 2009
y <- rnorm(n = 10, mean = 192, sd = 16.6) # represents mean, inflated SD of HHHHHHHHH RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and increased variance, diff n", xlab = ("Observed p-value"), breaks = 100)
The distribution of P-values comparing EEEEEEEE and HHHHHHHH was not changed much at all from varying the standard deviation and the sample sizes among groups using the mean julian day of sampling. I’m confident the violation of homoscedasticity is not influencing the significant difference in julian day among EEEEEEEE and HHHHHHHH RoSA genos in 2009 in a magnitude that we need to worry about.
Now let’s compare HHHHHHHH and LLLLLLLL.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 192, sd = 12.2) # represents mean and SD of HHHHHHHHH RoSA in 2009
y <- rnorm(n = 100, mean = 264, sd = 21.4) # represents mean and SD of LLLLLLLL RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and variance, equal n", xlab = ("Observed p-value"), breaks = 100)
# now lets say that the HHHHHHHH RoSA had equivalent SD to LLLLLLLL (increasing SD of HHHHHHHH higher than observed)
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 100, mean = 192, sd = 21.4) # represents mean julian of HHHHHHHHH RoSA in 2009 and sd of LLLLLLLL in 2009
y <- rnorm(n = 100, mean = 264, sd = 21.4) # represents mean and SD of HHHHHHHHH RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean and inflated H variance, equal n", xlab = ("Observed p-value"), breaks = 100)
If HHHHHHHH and LLLLLLLL had equal sample sizes the difference in variance among the genotypes wouldn’t strongly influence the mean difference in julian day.
Now I’ll incorporate a difference in group size AND variance.
nSims <- 10000
h0 <- numeric(nSims)
for (i in 1:nSims) {
x <- rnorm(n = 10, mean = 192, sd = 21.4) # represents mean and n HHHHHHHHH RoSA in 2009 with SD of LLLLLLLL in 09
y <- rnorm(n = 150, mean = 264, sd = 21.4) # represents mean, n and SD of LLLLLLLL RoSA in 2009
z <- t.test(x, y, var.equal = T)
h0[i] <- z$p.value
}
hist(h0, main = "Histogram of p-values with observed mean, changed variance for H and unequal n", xlab = ("Observed p-value"), breaks = 100)
Again, the distribution of P-values is still very significant and not greatly influenced by the difference in sample size or standard deviation among RoSA genotypes. Because the distribution of P-values doesn’t appear to be greatly influenced by violations of homoscedasticity with observed sample sizes I’m confident in the results of the ANOVA. Note, the larger difference in mean julian day among EEEEEEEE and LLLLLLLL means the results shouldn’t be infleunced by differences in sample size or SD. Also the EEEEEEEE and LLLLLLLL groups had the most similar n and sd in 2009.
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 3.6.0)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## car * 3.0-7 2020-03-11 [1] CRAN (R 3.6.0)
## carData * 3.0-3 2019-11-16 [1] CRAN (R 3.6.0)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## curl 4.3 2019-12-02 [1] CRAN (R 3.6.0)
## data.table 1.12.8 2019-12-09 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## foreign 0.8-72 2019-08-02 [2] CRAN (R 3.6.2)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## openxlsx 4.1.4 2019-12-06 [1] CRAN (R 3.6.0)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rio 0.5.16 2018-11-26 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
## zip 2.0.4 2019-09-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 12.51154 secs