We decided to do a power analysis to answer the question, “what difference in gonadosomatic index values would need to be present in the data to have a significant random intercept effect in model selection between a non-mixed model and a mixed effect model (RoSA=random)?”
library(tidyverse)
library(cowplot)
library(nlme)
dir.create("./outputs/103.1/", recursive = TRUE, showWarnings = FALSE)
rosa_data <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
estuary_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")
## Warning: Missing column names filled in: 'X16' [16]
rosa_meta <- left_join(estuary_meta, rosa_data, by = c("NMFS ID" = "NMFS_DNA_ID")) %>%
dplyr::select(-ID) %>%
rename(NMFS_DNA_ID = "NMFS ID") %>%
dplyr::select(NMFS_DNA_ID, rosa, everything())
Remove all samples with ANY missing data. And remove the recombinant genotypes. For this analysis I’m going to use estuary data from 2010 only. This is because there are only 12 EL fish sampled in 2009 which is a really small sample to estimate variance from. The larger EL sample size in 2010 (n=52) will be generate a more robust analysis.
stat_data <- rosa_meta %>%
rename(gonadsi = gsi) %>%
filter(
rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"),
year == "2010"
) %>%
mutate(
log_est = log(julian),
log_est2 = log_est^2
)
To simulate realistic data I am going to do simple linear regression on the data subsetted by RoSA genotype. From that I can estimate the gonadosomatic index values quite precisely. I’m going to simulate a single years worth of data to keep the power analysis simplified.
m_only <- stat_data %>% filter(rosa == "LLLLLLLL")
lm(gonadsi ~ 1 + log_est + log_est2 + sex, m_only)
##
## Call:
## lm(formula = gonadsi ~ 1 + log_est + log_est2 + sex, data = m_only)
##
## Coefficients:
## (Intercept) log_est log_est2 sexm
## 2.54219 -1.00516 0.10010 -0.01662
To generate variance in the data I’ll need the standard deviations of gonadsi for each sex.
stat_data %>%
filter(rosa == "LLLLLLLL") %>%
group_by(sex) %>%
summarise(
mean_gonad = mean(gonadsi), sd_gonad = sd(gonadsi),
mean_logestuarydate = mean(log_est), sd_logestuarydate = sd(log_est)
)
The linear regression effect values and standard deviations will be used to simulate the LL RoSA data.
int <- 2.54219
log_esteffect <- -1.00516
log_est2effect <- 0.10010
sex_effect <- -.01662
fd1 <- tibble(
logest_fake = rnorm(n = 100, mean = 5.49, sd = 0.08), # generates 100 random estuary entry dates based on mean entry date and SD from 2010 data
logest2_fake = logest_fake^2,
RoSA = "LL",
sex_fake = rep(0:1, each = 50), # create 50 males and 50 females
eps_female = rnorm(n = 100, mean = 0, sd = 0.0144), # error for females
eps_male = rnorm(n = 100, mean = 0, sd = 0.0121), # error for females
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake, # produce estimate from linear regression
gonad_fake = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female)
) %>% # add variance into estimate based on sex of individual
filter(gonad_fake > 0)
ggplot(fd1, aes(x = exp(logest_fake), y = gonad_fake, colour = factor(sex_fake))) +
geom_point() +
facet_wrap(sex_fake ~ .) +
ggtitle("female=0")
Cool, that appears to be ok for LL fish.
e_only <- stat_data %>% filter(rosa == "EEEEEEEE")
lm(gonadsi ~ log_est + log_est2 + sex, e_only)
##
## Call:
## lm(formula = gonadsi ~ log_est + log_est2 + sex, data = e_only)
##
## Coefficients:
## (Intercept) log_est log_est2 sexm
## 0.82412 -0.34246 0.03621 -0.01331
To generate variance in the data I’ll need the standard deviations of gonadsi for each sex.
stat_data %>%
filter(rosa == "EEEEEEEE") %>%
group_by(sex) %>%
summarise(
mean_gonad = mean(gonadsi), sd_gonad = sd(gonadsi),
mean_logestuarydate = mean(log_est), sd_logestuarydate = sd(log_est)
)
int <- 0.82412
log_esteffect <- -0.34246
log_est2effect <- 0.03621
sex_effect <- -0.01331
fd2 <- tibble(
logest_fake = rnorm(n = 100, mean = 5.05, sd = 0.11), # mean and SD from 2009 data
logest2_fake = logest_fake^2,
RoSA = "EE",
sex_fake = rep(0:1, each = 50),
eps_female = rnorm(n = 100, mean = 0, sd = 0.0070),
eps_male = rnorm(n = 100, mean = 0, sd = 0.0026),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_fake = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female)
) %>%
filter(gonad_fake > 0)
ggplot(fd2, aes(x = exp(logest_fake), y = gonad_fake, colour = factor(sex_fake))) +
geom_point() +
facet_wrap(sex_fake ~ .)
heterozygote linear regression
h_only <- stat_data %>% filter(rosa == "HHHHHHHH")
lm(gonadsi ~ log_est + log_est2 + sex, h_only)
##
## Call:
## lm(formula = gonadsi ~ log_est + log_est2 + sex, data = h_only)
##
## Coefficients:
## (Intercept) log_est log_est2 sexm
## 0.87697 -0.37153 0.04004 -0.02044
To generate variance in the data I’ll need the standard deviations of gonadsi for each sex.
stat_data %>%
filter(rosa == "HHHHHHHH") %>%
group_by(sex) %>%
summarise(
mean_gonad = mean(gonadsi), sd_gonad = sd(gonadsi),
mean_logestuarydate = mean(log_est), sd_logestuarydate = sd(log_est)
)
int <- 0.87697
log_esteffect <- -0.37153
log_est2effect <- 0.04004
sex_effect <- -0.02044
fd3 <- tibble(
logest_fake = rnorm(n = 50, mean = 5.27, sd = 0.08), # mean and SD from 2009
logest2_fake = logest_fake^2,
RoSA = "EL",
sex_fake = rep(0:1, each = 25),
eps_female = rnorm(n = 50, mean = 0, sd = 0.0184),
eps_male = rnorm(n = 50, mean = 0, sd = 0.0046),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_fake = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female)
) %>%
filter(gonad_fake > 0)
ggplot(fd3, aes(x = exp(logest_fake), y = gonad_fake, colour = factor(sex_fake))) +
geom_point() +
facet_wrap(sex_fake ~ .)
join the simulated EE, EL and LL data frames together and plot it. This plot should look pretty damn similar to the real data.
lm_sim <- bind_rows(fd1, fd2, fd3)
sex_names <- list(
"0" = "Female",
"1" = "Male"
)
plot_labeller <- function(variable, value) {
return(sex_names[value])
}
ggplot(lm_sim, aes(x = exp(logest_fake), y = gonad_fake, colour = RoSA)) +
geom_point() +
facet_wrap(sex_fake ~ .) +
scale_x_continuous(
name = "Estuary sampling date", limits = c(121, 305), breaks = c(121, 152, 182, 213, 244, 274),
labels = c("May-01", "June-01", "July-01", "Aug-01", "Sept-01", "Oct-01")
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
strip.background = element_blank()
) +
facet_grid(. ~ factor(sex_fake), labeller = plot_labeller)
## Warning: The labeller API has been updated. Labellers taking `variable` and
## `value` arguments are now deprecated. See labellers documentation.
## Warning: Removed 1 rows containing missing values (geom_point).
Cool, I think that looks pretty good. Now I’ll just need to add in the estimated early allele effect as the code above simply generates data that is similar to the real gonadosomatic index data.
I’ll assume an additive model here too. For every E allele in the RoSA genotype the gonadosomatic index will increase by X units. This will be straightforward to add into the data generation in a function.
WRAP INTO FUNCTION FOR POWER ANALYSIS. Let’s make a simulated data generation function first.
gonad_data <- function(allele_effect, pop_size) {
# EE data
int <- 0.82412
log_esteffect <- -0.34246
log_est2effect <- 0.03621
sex_effect <- -0.01331
fake_ee <- tibble(
logest_fake = rnorm(n = 100, mean = 5.05, sd = 0.11), # mean and SD from 2010 data
logest2_fake = logest_fake^2,
RoSA = "EE",
sex_fake = rep(0:1, each = 50),
eps_female = rnorm(n = 100, mean = 0, sd = 0.0070),
eps_male = rnorm(n = 100, mean = 0, sd = 0.0026),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var
) %>%
filter(gonad_fake > 0)
# EL data
int <- 0.87697
log_esteffect <- -0.37153
log_est2effect <- 0.04004
sex_effect <- -0.02044
fake_el <- tibble(
logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.08),
logest2_fake = logest_fake^2,
RoSA = "EL",
sex_fake = rep(0:1, each = pop_size / 4),
eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.0184),
eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.0046),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var + allele_effect
) %>%
filter(gonad_fake > 0)
# LL data
int <- 2.54219
log_esteffect <- -1.00516
log_est2effect <- 0.10010
sex_effect <- -.01662
fake_ll <- tibble(
logest_fake = rnorm(n = pop_size, mean = 5.49, sd = 0.08),
logest2_fake = logest_fake^2,
RoSA = "LL",
sex_fake = rep(0:1, each = pop_size / 2),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.0144),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.0121),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var + 2 * allele_effect
) %>%
filter(gonad_fake > 0)
lm_sim <- bind_rows(fake_ee, fake_el, fake_ll) %>%
mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
lm_sim
}
tmp1 <- gonad_data(allele_effect = 0.02, pop_size = 100) # each L haplo increases gsi by 2 units.
ggplot(tmp1, aes(x = exp(logest_fake), y = gonad_fake, colour = RoSA)) +
geom_point() +
facet_wrap(sex_fake ~ .) +
scale_x_continuous(
name = "Estuary sampling date", limits = c(121, 305), breaks = c(121, 152, 182, 213, 244, 274),
labels = c("May-01", "June-01", "July-01", "Aug-01", "Sept-01", "Oct-01")
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
strip.background = element_blank()
) +
facet_grid(. ~ factor(sex_fake), labeller = plot_labeller)
## Warning: The labeller API has been updated. Labellers taking `variable` and
## `value` arguments are now deprecated. See labellers documentation.
The function appears to be working as I want it to. Let’s see if an early allele effect of 0.02 produces a statistically significant random intercept
logest2Power <- varPower(form = ~logest2_fake)
power_tmp <- gls(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, weights = logest2Power, data = tmp1) # remove the year factor, simulate a single year of data.
power_mm_tmp <- lme(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", weights = logest2Power, data = tmp1) # random intercept model
anova(power_tmp, power_mm_tmp)
Make a function that incorporates the model selection between a GLS and mixed effects model (random intercept)
gonad_data_pwr_analysis <- function(allele_effect, pop_size) {
# EE data
int <- 0.82412
log_esteffect <- -0.34246
log_est2effect <- 0.03621
sex_effect <- -0.01331
fake_ee <- tibble(
logest_fake = rnorm(n = 100, mean = 5.05, sd = 0.11), # mean and SD from 2010 data
logest2_fake = logest_fake^2,
RoSA = "EE",
sex_fake = rep(0:1, each = 50),
eps_female = rnorm(n = 100, mean = 0, sd = 0.0070),
eps_male = rnorm(n = 100, mean = 0, sd = 0.0026),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var
) %>%
filter(gonad_fake > 0)
# EL data
int <- 0.87697
log_esteffect <- -0.37153
log_est2effect <- 0.04004
sex_effect <- -0.02044
fake_el <- tibble(
logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.08),
logest2_fake = logest_fake^2,
RoSA = "EL",
sex_fake = rep(0:1, each = pop_size / 4),
eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.0184),
eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.0046),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var + allele_effect
) %>%
filter(gonad_fake > 0)
# LL data
int <- 2.54219
log_esteffect <- -1.00516
log_est2effect <- 0.10010
sex_effect <- -.01662
fake_ll <- tibble(
logest_fake = rnorm(n = pop_size, mean = 5.49, sd = 0.08),
logest2_fake = logest_fake^2,
RoSA = "LL",
sex_fake = rep(0:1, each = pop_size / 2),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.0144),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.0121),
gonad_est = int + log_esteffect * logest_fake + log_est2effect * logest2_fake + sex_effect * sex_fake,
gonad_var = ifelse(sex_fake == 1, gonad_est + eps_male, gonad_est + eps_female),
gonad_fake = gonad_var + 2 * allele_effect
) %>%
filter(gonad_fake > 0)
lm_sim <- bind_rows(fake_ee, fake_el, fake_ll) %>%
mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
logest2Power <- varPower(form = ~logest2_fake)
power_tmp <- gls(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, weights = logest2Power, data = lm_sim) # remove the year factor, simulate a single year of data.
power_mm_tmp <- lme(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", weights = logest2Power, data = lm_sim) # random intercept model
p_val <- anova(power_tmp, power_mm_tmp)$"p-value"[2]
p_val
}
Run model selection 1000 times and see how many times the mixed effects model is preferred over the GLS
rep_outp <- replicate(1e3, gonad_data_pwr_analysis(allele_effect = 0.02, pop_size = 100))
table(rep_outp <= 0.05)
##
## TRUE
## 1000
Wow, I’m surprised by that. But it’s good news for the gonadosomatic index analysis!
What effect size would 80% power be achieved? This is going to be smaller than I expected. Only do 250 simulated data sets for each effect size, here.
alpha <- 0.05
power_table <- tibble(
allele_effect = seq(0.0075, 0.02, by = 0.0025)
) %>%
mutate(power = map_dbl(allele_effect, function(allele_effect) {
ps <- replicate(250, gonad_data_pwr_analysis(allele_effect = allele_effect, pop_size = 100))
mean(ps < alpha)
}))
ggplot(power_table, aes(allele_effect, power)) +
geom_point() +
geom_line() +
scale_x_continuous(limits = c(0.0075, 0.02), breaks = c(0.0075, 0.01, 0.0125, 0.0150, 0.0175, 0.02), labels = c(0.0075, 0.01, 0.0125, 0.0150, 0.0175, 0.02)) +
geom_hline(yintercept = 0.8)
ggsave("./outputs/103.1/allele_effect_gonadsi_power_simulation.pdf")
Wow, so we could find a statistically significant random intercept effect with as little as 0.024 gonadsi units separating EE and LL genotypes.
Now make a 2x2 matrix figure to show what the data would look like with an allele effect of 0.012 per L haplotype.
sex_names <- list(
"0" = "Female",
"1" = "Male"
)
plot_labeller <- function(variable, value) {
return(sex_names[value])
}
Set up some functions to simulate 20 data sets and do all the calculations on them and then keep the first four that are significant (just to see what they look like).
Simulation 1 for the figure. Make sure the ANOVA is significant, it it turns out not to be significant re-run the code chunk until you get a significant result.
plot_simmed_data <- function(tmp1) {
ggplot(tmp1, aes(x = exp(logest_fake), y = gonad_fake, colour = RoSA)) +
geom_point() +
facet_wrap(sex_fake ~ .) +
scale_x_continuous(
name = "Estuary sampling date", limits = c(121, 349), breaks = c(121, 152, 182, 213, 244, 274, 305, 335),
labels = c("May-01", "June-01", "July-01", "Aug-01", "Sept-01", "Oct-01", "Nov-01", "Dec-01")
) +
scale_y_continuous(
name = "fake gonadosomatic index", limits = c(0, 0.15), breaks = c(0, 0.05, 0.10, 0.15),
labels = c(0, 0.05, 0.10, 0.15)
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
strip.background = element_blank()
) +
facet_grid(. ~ factor(sex_fake), labeller = plot_labeller)
}
power_non_mixed <- function(tmp1) {
gls(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, weights = logest2Power, data = tmp1) # remove the year factor, simulate a single year of data.
}
power_mixed <- function(tmp1) {
power_mm_tmp1 <- lme(gonad_fake ~ 1 + logest_fake + logest2_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", weights = logest2Power, data = tmp1) # random intercept model
}
Now, simulate 20 data sets and analyze them. Keep it all in a tibble:
set.seed(25) # for reproducibility
sim_tibble <- tibble(
data_sets = lapply(1:20, function(x) gonad_data(allele_effect = 0.012, pop_size = 100))
) %>%
mutate(
plots = map(data_sets, plot_simmed_data),
power_M = map(data_sets, power_non_mixed),
power_mm = map(data_sets, power_mixed),
anova = map2(power_M, power_mm, .f = anova),
p_value = map_dbl(anova, ~ .$`p-value`[2]),
ranef = map(power_mm, ranef)
)
Now, keep the first 4 that have a p-value < 0.05 and put their plots together:
signif_4 <- sim_tibble %>%
filter(p_value < 0.05) %>%
slice(1:4)
Let’s look at the randome effect intercepts of those:
signif_4$ranef
## [[1]]
## (Intercept)
## EE -0.010820273
## EL 0.002593675
## LL 0.008226598
##
## [[2]]
## (Intercept)
## EE -0.011327176
## EL 0.001659429
## LL 0.009667747
##
## [[3]]
## (Intercept)
## EE -0.011307137
## EL -0.000173805
## LL 0.011480942
##
## [[4]]
## (Intercept)
## EE -0.012122234
## EL 0.003263334
## LL 0.008858900
And the p-values:
signif_4$p_value
## [1] 7.483650e-10 3.225608e-11 8.248858e-11 2.823935e-08
Plot the 4 simulated data sets.
plot_grid(signif_4$plots[[1]],
signif_4$plots[[2]],
signif_4$plots[[3]],
signif_4$plots[[4]],
ncol = 2
)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
ggsave("./outputs/103.1/simulated_gonadosomatic_index_sampling_date_faceted.pdf", width = 16, height = 10)
This plot graphically represents the difference in gonadosomatic index among EE, EL and LL genotypes that would need to be present to find a statistically significant random effect at 80% power.
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 1.796038 mins