library(tidyverse)
library(cowplot)
library(nlme)
dir.create("./outputs/104.1/", recursive = TRUE, showWarnings = FALSE)
The idea behind this is to determine what difference in mean fatness would need to be present in the data between homozygotes given the variance in NWF we measured to find a statistically significant random intercept at 80% power?
load RoSA genotype data and add estuary metadata to it.
rosa_data <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
estuary_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")
## Warning: Missing column names filled in: 'X16' [16]
rosa_meta <- left_join(estuary_meta, rosa_data, by = c("NMFS ID" = "NMFS_DNA_ID")) %>%
dplyr::select(-ID) %>%
rename(NMFS_DNA_ID = "NMFS ID") %>%
dplyr::select(NMFS_DNA_ID, rosa, everything())
Remove all samples with ANY missing data. And remove the recombinant genotypes.
stat_data <- rosa_meta %>%
rename(fatness = drywet) %>%
filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
mutate(
log_est = log(julian),
year_f = factor(year, levels = c("2009", "2010"))
)
Let’s say that early-homozygote RoSA increase to 5% greater than late-homozygote RoSA. In this case each E allele increases fatness by 0.025 units (simple additive model). This is the assumption we are making to start. I’ll vary the additive allele effect in a more systematic way later, but I just want to get the code working for right now.
Creating fake data for each genotype: To do this I will subset the stat_data by RoSA genotype and run a simple linear regression of the form NWF~ intercept +b1log_estuary_date + b2sex + b3*year That will give me effect sizes for each RoSA genotype to most closely mimic the real data. Each fake dataset per genotype will be generated with the genotype specific linear regression effect values.
For creating estuary sampling dates I will randomly sample a normal distribution with mean equal to the mean and standard deviation corresponding to the mean and SD of the real data for each genotype.
I’ll create a balanced number of fish per homozygous genotype (n=100)and half that for heterozygotes to match the observed data in 2010. We’ll simulate this in a single year (no need to create a year factor like the analysis with real data)
Calculate the mean and sd for log estuary date and NWF by sex and genotype
stat_data %>%
group_by(rosa, sex) %>%
summarise(
mean_fat = mean(fatness),
sd_fat = sd(fatness),
mean_logest = mean(log_est),
sd_logest = sd(log_est)
)
stat_data %>%
filter(rosa == "EEEEEEEE") %>%
lm(fatness ~ log_est + sex + year_f, data = .)
##
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
##
## Coefficients:
## (Intercept) log_est sexm year_f2010
## 0.85635 -0.10850 0.02683 0.03168
pop_size <- 100
int <- 0.85635
log_esteffect <- -0.10850
sex_effect <- 0.02683
allele_effect <- 0
sim_early <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + 2 * allele_effect,
RoSA = "EE"
)
stat_data %>%
filter(rosa == "HHHHHHHH") %>%
lm(fatness ~ log_est + sex + year_f, data = .)
##
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
##
## Coefficients:
## (Intercept) log_est sexm year_f2010
## 0.003811 0.055063 0.044197 0.034599
pop_size <- 100
int <- 0.003811
log_esteffect <- 0.055063
sex_effect <- 0.044197
sim_het <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + allele_effect,
RoSA = "EL"
)
stat_data %>%
filter(rosa == "LLLLLLLL") %>%
lm(fatness ~ log_est + sex + year_f, data = .)
##
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
##
## Coefficients:
## (Intercept) log_est sexm year_f2010
## 1.17710 -0.16408 0.03727 0.02529
int <- 1.1771
log_esteffect <- -0.16408
sex_effect <- 0.03727
sim_late <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + 2 * allele_effect,
RoSA = "LL"
)
sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
ggplot(sim_data, aes(x = logest_fake, y = fat_fake, colour = RoSA)) +
geom_point() +
facet_grid(sex_fake ~ .)
library(nlme)
power_M0 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = sim_data) # remove the year factor, simulate a single year of data.
power_mm0 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = sim_data) # random intercept model
anova(power_M0, power_mm0)$"p-value"[2]
## [1] 0.2074184
Not even close.
ggplot(sim_data, aes(x = RoSA, y = fat_fake)) +
geom_boxplot()
Combine the above code into a function so we can use run a simulation analyses 1,000 or 10,000 times.
fat_power <- function(pop_size, allele_effect) {
pop_size <- 100
allele_effect <- allele_effect
# EE specific effect sizes from linear regression
int <- 0.85635
log_esteffect <- -0.10850
sex_effect <- 0.02683
sim_early <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + 2 * allele_effect,
RoSA = "EE"
)
# EL specific effect sizes from linear regression
int <- 0.003811
log_esteffect <- 0.055063
sex_effect <- 0.044197
sim_het <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + allele_effect,
RoSA = "EL"
)
# LL specific effect sizes from linear regression
int <- 1.1771
log_esteffect <- -0.16408
sex_effect <- 0.03727
sim_late <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_fake = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
RoSA = "LL"
)
sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
power_M0 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = sim_data) # remove the year factor, simulate a single year of data.
power_mm0 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = sim_data) # random intercept model
p_val <- anova(power_M0, power_mm0)$"p-value"[2]
p_val
}
rep_outp <- replicate(1e3, fat_power(allele_effect = 0.025, pop_size = 100))
table(rep_outp <= 0.05)
##
## FALSE TRUE
## 780 220
Well, that isn’t even close to the 80% power we’d like to see.
What effect size would 80% power be achieved?
alpha <- 0.05
power_table <- tibble(
allele_effect = seq(0.025, 0.05, by = 0.005)
) %>%
mutate(power = map_dbl(allele_effect, function(allele_effect) {
ps <- replicate(1e3, fat_power(allele_effect = allele_effect, pop_size = 100))
mean(ps < alpha)
}))
ggplot(power_table, aes(allele_effect, power)) +
geom_smooth() +
geom_point() +
geom_hline(yintercept = 0.8)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
ggsave("./outputs/104.1/allele_effect_fatness_power_simulation.pdf")
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
To detect a significant random intercept the difference in fatness among homozygotes would need to be about 8.5%. The early allele effect would be +4.25% fatness
We want to make plots of of simulated data sets in which RoSA genotype has a significant effect on NWF, so we can see what the looks like visually.
We simulate data sets using an allele effect that gives us 80% power. We simulate 20 data sets (with 80% power than we are certain that at least 4 of those will yield significant effects of RoSA on fatness) and then plot the first four that are significant.
Here is a function to make simulated data.
fat_fake_data <- function(allele_effect, pop_size) {
pop_size <- 100
allele_effect <- allele_effect
# EE specific effect sizes from linear regression
int <- 0.85635
log_esteffect <- -0.10850
sex_effect <- 0.02683
sim_early <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + 2 * allele_effect,
RoSA = "EE"
)
# EL specific effect sizes from linear regression
int <- 0.003811
log_esteffect <- 0.055063
sex_effect <- 0.044197
sim_het <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
fat_fake = fat_var + allele_effect,
RoSA = "EL"
)
# LL specific effect sizes from linear regression
int <- 1.1771
log_esteffect <- -0.16408
sex_effect <- 0.03727
sim_late <- tibble(
sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
fat_fake = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
RoSA = "LL"
)
sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
sim_data
}
To make a plot similar to the real data, we want a labeller function.
sex_names <- list(
"0" = "Female",
"1" = "Male"
)
plot_labeller <- function(variable, value) {
return(sex_names[value])
}
And we also want a function that returns a ggplot
ggplot_fake_fat_data <- function(dat2plot) {
ggplot(dat2plot, aes(x = exp(logest_fake), y = fat_fake, colour = RoSA)) +
geom_point() +
scale_y_continuous(name = "Simulated Fatness", limits = c(0.15, .6)) +
scale_x_continuous(
name = "Estuary sampling date", limits = c(121, 305), breaks = c(121, 152, 182, 213, 244, 274),
labels = c("May-01", "June-01", "July-01", "Aug-01", "Sept-01", "Oct-01")
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
strip.background = element_blank()
) +
facet_grid(. ~ factor(sex_fake), labeller = plot_labeller)
}
And we also want a function that analyzes the fat fake data
power_M1_func <- function(dat2plot) {
power_M1 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = dat2plot) # remove the year factor, simulate a single year of data.
}
power_mm1_func <- function(dat2plot) {
power_mm1 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = dat2plot) # random intercept model
}
And now we can make a tibble that stores 20 simulated data sets and the outputs of their analyses. We will use some list columns:
set.seed(5) # for reproducibility
sim_tibble <- tibble(
data_sets = lapply(1:10, function(x) fat_fake_data(allele_effect = 0.0425, pop_size = 100))
) %>%
mutate(
plots = map(data_sets, ggplot_fake_fat_data),
power_M1 = map(data_sets, power_M1_func),
power_mm1 = map(data_sets, power_mm1_func),
anova = map2(.x = power_M1, .y = power_mm1, .f = anova),
ranef = map(power_mm1, ranef),
signif = map_dbl(anova, ~ .$`p-value`[2])
)
Now, take the first four that have a p-value < 0.05:
signif_4 <- sim_tibble %>%
filter(signif < 0.05) %>%
slice(1:4)
Let’s look at the intercepts of those:
signif_4$ranef
## [[1]]
## (Intercept)
## EE 0.03452647
## EL 0.01149402
## LL -0.04602049
##
## [[2]]
## (Intercept)
## EE 0.02983694
## EL 0.01245745
## LL -0.04229439
##
## [[3]]
## (Intercept)
## EE 0.036790405
## EL 0.008765027
## LL -0.045555432
##
## [[4]]
## (Intercept)
## EE 0.02648907
## EL -0.00243971
## LL -0.02404936
And the significances:
signif_4$signif
## [1] 2.047561e-05 1.080046e-04 4.427026e-05 1.450386e-02
Now we can plot the 4 simulated data sets.
plot_grid(signif_4$plots[[1]],
signif_4$plots[[2]],
signif_4$plots[[3]],
signif_4$plots[[4]],
ncol = 2
)
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
ggsave("./outputs/104.1/simulated_fatness_sampling_date_faceted.pdf", width = 11)
This plot graphically represents the difference in NWF among EE, EL and LL genotypes that would need to be present to find a statistically significant random effect at 80% power.
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 2.161547 mins