library(tidyverse)
library(cowplot)
library(nlme)

dir.create("./outputs/104.1/", recursive = TRUE, showWarnings = FALSE)

The idea behind this is to determine what difference in mean fatness would need to be present in the data between homozygotes given the variance in NWF we measured to find a statistically significant random intercept at 80% power?

Develop the power analysis

load RoSA genotype data and add estuary metadata to it.

rosa_data <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")

estuary_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")

## Warning: Missing column names filled in: 'X16' [16]

rosa_meta <- left_join(estuary_meta, rosa_data, by = c("NMFS ID" = "NMFS_DNA_ID")) %>%
  dplyr::select(-ID) %>%
  rename(NMFS_DNA_ID = "NMFS ID") %>%
  dplyr::select(NMFS_DNA_ID, rosa, everything())

Remove all samples with ANY missing data. And remove the recombinant genotypes.

stat_data <- rosa_meta %>%
  rename(fatness = drywet) %>%
  filter(rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")) %>%
  mutate(
    log_est = log(julian),
    year_f = factor(year, levels = c("2009", "2010"))
  )

Let’s say that early-homozygote RoSA increase to 5% greater than late-homozygote RoSA. In this case each E allele increases fatness by 0.025 units (simple additive model). This is the assumption we are making to start. I’ll vary the additive allele effect in a more systematic way later, but I just want to get the code working for right now.

Creating fake data for each genotype: To do this I will subset the stat_data by RoSA genotype and run a simple linear regression of the form NWF~ intercept +b1log_estuary_date + b2sex + b3*year That will give me effect sizes for each RoSA genotype to most closely mimic the real data. Each fake dataset per genotype will be generated with the genotype specific linear regression effect values.

For creating estuary sampling dates I will randomly sample a normal distribution with mean equal to the mean and standard deviation corresponding to the mean and SD of the real data for each genotype.

I’ll create a balanced number of fish per homozygous genotype (n=100)and half that for heterozygotes to match the observed data in 2010. We’ll simulate this in a single year (no need to create a year factor like the analysis with real data)

Calculate the mean and sd for log estuary date and NWF by sex and genotype

stat_data %>%
  group_by(rosa, sex) %>%
  summarise(
    mean_fat = mean(fatness),
    sd_fat = sd(fatness),
    mean_logest = mean(log_est),
    sd_logest = sd(log_est)
  )

Creating fake EE genotype data

stat_data %>%
  filter(rosa == "EEEEEEEE") %>%
  lm(fatness ~ log_est + sex + year_f, data = .)

## 
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
## 
## Coefficients:
## (Intercept)      log_est         sexm   year_f2010  
##     0.85635     -0.10850      0.02683      0.03168

pop_size <- 100
int <- 0.85635
log_esteffect <- -0.10850
sex_effect <- 0.02683
allele_effect <- 0
sim_early <- tibble(
  sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
  logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
  eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
  eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
  fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
  fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
  fat_fake = fat_var + 2 * allele_effect,
  RoSA = "EE"
)

creating fake data for EL genotype

stat_data %>%
  filter(rosa == "HHHHHHHH") %>%
  lm(fatness ~ log_est + sex + year_f, data = .)

## 
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
## 
## Coefficients:
## (Intercept)      log_est         sexm   year_f2010  
##    0.003811     0.055063     0.044197     0.034599

pop_size <- 100
int <- 0.003811
log_esteffect <- 0.055063
sex_effect <- 0.044197


sim_het <- tibble(
  sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
  logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
  eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
  eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
  fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
  fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
  fat_fake = fat_var + allele_effect,
  RoSA = "EL"
)

creating fake data for LL genotype

stat_data %>%
  filter(rosa == "LLLLLLLL") %>%
  lm(fatness ~ log_est + sex + year_f, data = .)

## 
## Call:
## lm(formula = fatness ~ log_est + sex + year_f, data = .)
## 
## Coefficients:
## (Intercept)      log_est         sexm   year_f2010  
##     1.17710     -0.16408      0.03727      0.02529

int <- 1.1771
log_esteffect <- -0.16408
sex_effect <- 0.03727


sim_late <- tibble(
  sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
  logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
  eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
  eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
  fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
  fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
  fat_fake = fat_var + 2 * allele_effect,
  RoSA = "LL"
)

Combine the three fake data sets and plot it

sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
  mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))

ggplot(sim_data, aes(x = logest_fake, y = fat_fake, colour = RoSA)) +
  geom_point() +
  facet_grid(sex_fake ~ .)

likelihood ratio test. is there evidence for a significant difference in intercept of fat level among the 3 genotypes?

library(nlme)

power_M0 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = sim_data) # remove the year factor, simulate a single year of data.

power_mm0 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = sim_data) # random intercept model

anova(power_M0, power_mm0)$"p-value"[2]

## [1] 0.2074184

Not even close.

ggplot(sim_data, aes(x = RoSA, y = fat_fake)) +
  geom_boxplot()

Make a function

Combine the above code into a function so we can use run a simulation analyses 1,000 or 10,000 times.

fat_power <- function(pop_size, allele_effect) {
  pop_size <- 100
  allele_effect <- allele_effect

  # EE specific effect sizes from linear regression
  int <- 0.85635
  log_esteffect <- -0.10850
  sex_effect <- 0.02683
  sim_early <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
    logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
    eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
    eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    fat_fake = fat_var + 2 * allele_effect,
    RoSA = "EE"
  )

  # EL specific effect sizes from linear regression
  int <- 0.003811
  log_esteffect <- 0.055063
  sex_effect <- 0.044197


  sim_het <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
    logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
    eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
    eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    fat_fake = fat_var + allele_effect,
    RoSA = "EL"
  )

  # LL specific effect sizes from linear regression
  int <- 1.1771
  log_esteffect <- -0.16408
  sex_effect <- 0.03727

  sim_late <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
    logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
    eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
    eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_fake = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    RoSA = "LL"
  )

  sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
    mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))

  power_M0 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = sim_data) # remove the year factor, simulate a single year of data.

  power_mm0 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = sim_data) # random intercept model

  p_val <- anova(power_M0, power_mm0)$"p-value"[2]

  p_val
}

rep_outp <- replicate(1e3, fat_power(allele_effect = 0.025, pop_size = 100))

table(rep_outp <= 0.05)

## 
## FALSE  TRUE 
##   780   220

Well, that isn’t even close to the 80% power we’d like to see.

What effect size would 80% power be achieved?

alpha <- 0.05
power_table <- tibble(
  allele_effect = seq(0.025, 0.05, by = 0.005)
) %>%
  mutate(power = map_dbl(allele_effect, function(allele_effect) {
    ps <- replicate(1e3, fat_power(allele_effect = allele_effect, pop_size = 100))
    mean(ps < alpha)
  }))

ggplot(power_table, aes(allele_effect, power)) +
  geom_smooth() +
  geom_point() +
  geom_hline(yintercept = 0.8)

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6

## Warning in sqrt(sum.squares/one.delta): NaNs produced

## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

ggsave("./outputs/104.1/allele_effect_fatness_power_simulation.pdf")

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 6

## Warning in sqrt(sum.squares/one.delta): NaNs produced

## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf

To detect a significant random intercept the difference in fatness among homozygotes would need to be about 8.5%. The early allele effect would be +4.25% fatness

Create simulated data sets for plotting

We want to make plots of of simulated data sets in which RoSA genotype has a significant effect on NWF, so we can see what the looks like visually.

We simulate data sets using an allele effect that gives us 80% power. We simulate 20 data sets (with 80% power than we are certain that at least 4 of those will yield significant effects of RoSA on fatness) and then plot the first four that are significant.

Here is a function to make simulated data.

fat_fake_data <- function(allele_effect, pop_size) {
  pop_size <- 100
  allele_effect <- allele_effect

  # EE specific effect sizes from linear regression
  int <- 0.85635
  log_esteffect <- -0.10850
  sex_effect <- 0.02683
  sim_early <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
    logest_fake = rnorm(n = pop_size, mean = 5.03, sd = 0.112),
    eps_male = rnorm(n = pop_size, mean = 0, sd = 0.052),
    eps_female = rnorm(n = pop_size, mean = 0, sd = 0.051),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    fat_fake = fat_var + 2 * allele_effect,
    RoSA = "EE"
  )

  # EL specific effect sizes from linear regression
  int <- 0.003811
  log_esteffect <- 0.055063
  sex_effect <- 0.044197


  sim_het <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 4), # 1 =male
    logest_fake = rnorm(n = pop_size / 2, mean = 5.27, sd = 0.079),
    eps_male = rnorm(n = pop_size / 2, mean = 0, sd = 0.049),
    eps_female = rnorm(n = pop_size / 2, mean = 0, sd = 0.038),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_var = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    fat_fake = fat_var + allele_effect,
    RoSA = "EL"
  )


  # LL specific effect sizes from linear regression
  int <- 1.1771
  log_esteffect <- -0.16408
  sex_effect <- 0.03727

  sim_late <- tibble(
    sex_fake = rep(c(1, 0), each = pop_size / 2), # 1 =male
    logest_fake = rnorm(n = pop_size, mean = 5.54, sd = 0.090),
    eps_male = rnorm(n = pop_size, mean = 0, sd = 0.055),
    eps_female = rnorm(n = pop_size, mean = 0, sd = 0.036),
    fat_est = int + log_esteffect * logest_fake + sex_effect * sex_fake,
    fat_fake = ifelse(sex_fake == 1, fat_est + eps_male, fat_est + eps_female),
    RoSA = "LL"
  )

  sim_data <- bind_rows(sim_early, sim_het, sim_late) %>%
    mutate(RoSA = factor(RoSA, levels = c("EE", "EL", "LL")))
  sim_data
}

To make a plot similar to the real data, we want a labeller function.

sex_names <- list(
  "0" = "Female",
  "1" = "Male"
)

plot_labeller <- function(variable, value) {
  return(sex_names[value])
}

And we also want a function that returns a ggplot

ggplot_fake_fat_data <- function(dat2plot) {
  ggplot(dat2plot, aes(x = exp(logest_fake), y = fat_fake, colour = RoSA)) +
    geom_point() +
    scale_y_continuous(name = "Simulated Fatness", limits = c(0.15, .6)) +
    scale_x_continuous(
      name = "Estuary sampling date", limits = c(121, 305), breaks = c(121, 152, 182, 213, 244, 274),
      labels = c("May-01", "June-01", "July-01", "Aug-01", "Sept-01", "Oct-01")
    ) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
      strip.background = element_blank()
    ) +
    facet_grid(. ~ factor(sex_fake), labeller = plot_labeller)
}

And we also want a function that analyzes the fat fake data

power_M1_func <- function(dat2plot) {
  power_M1 <- gls(fat_fake ~ 1 + logest_fake + sex_fake, data = dat2plot) # remove the year factor, simulate a single year of data.
}

power_mm1_func <- function(dat2plot) {
  power_mm1 <- lme(fat_fake ~ 1 + logest_fake + sex_fake, random = ~ 1 | RoSA, method = "REML", data = dat2plot) # random intercept model
}

And now we can make a tibble that stores 20 simulated data sets and the outputs of their analyses. We will use some list columns:

set.seed(5) # for reproducibility
sim_tibble <- tibble(
  data_sets = lapply(1:10, function(x) fat_fake_data(allele_effect = 0.0425, pop_size = 100))
) %>%
  mutate(
    plots = map(data_sets, ggplot_fake_fat_data),
    power_M1 = map(data_sets, power_M1_func),
    power_mm1 = map(data_sets, power_mm1_func),
    anova = map2(.x = power_M1, .y = power_mm1, .f = anova),
    ranef = map(power_mm1, ranef),
    signif = map_dbl(anova, ~ .$`p-value`[2])
  )

Now, take the first four that have a p-value < 0.05:

signif_4 <- sim_tibble %>%
  filter(signif < 0.05) %>%
  slice(1:4)

Let’s look at the intercepts of those:

signif_4$ranef

## [[1]]
##    (Intercept)
## EE  0.03452647
## EL  0.01149402
## LL -0.04602049
## 
## [[2]]
##    (Intercept)
## EE  0.02983694
## EL  0.01245745
## LL -0.04229439
## 
## [[3]]
##     (Intercept)
## EE  0.036790405
## EL  0.008765027
## LL -0.045555432
## 
## [[4]]
##    (Intercept)
## EE  0.02648907
## EL -0.00243971
## LL -0.02404936

And the significances:

signif_4$signif

## [1] 2.047561e-05 1.080046e-04 4.427026e-05 1.450386e-02

Now we can plot the 4 simulated data sets.

plot_grid(signif_4$plots[[1]],
  signif_4$plots[[2]],
  signif_4$plots[[3]],
  signif_4$plots[[4]],
  ncol = 2
)

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_point).

ggsave("./outputs/104.1/simulated_fatness_sampling_date_faceted.pdf", width = 11)

This plot graphically represents the difference in NWF among EE, EL and LL genotypes that would need to be present to find a statistically significant random effect at 80% power.

Running Time

Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:

Sys.time() - start_time

## Time difference of 2.161547 mins

104.1-klamath-estuary-nonwaterfraction-power-analysis

Neil Thompson

Last Updated: 2020-05-14