ROSA analysis of klamath estuary fish with gonadosomatic index metric. Conducting a model selection approach to evaluate the influence of RoSA genotype on gonadosomatic index.

library(tidyverse)
library(nlme)

Load RoSA genotype data and join it to the estuary metadata which includes Gonadosomatic Index data (Hearsey 2011).

rosa_data <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
estuary_meta <- read_csv("./data/102-Klamath-entryDATA_withNMFSID.csv")
## Warning: Missing column names filled in: 'X16' [16]
rosa_meta <- left_join(estuary_meta, rosa_data, by = c("NMFS ID" = "NMFS_DNA_ID")) %>%
  dplyr::select(-ID) %>%
  rename(NMFS_DNA_ID = "NMFS ID") %>%
  dplyr::select(NMFS_DNA_ID, rosa, everything())

Remove all samples with ANY missing data. And the recombinant genotypes.

rosa_stats <- rosa_meta %>%
  filter(
    !str_detect(rosa, "\\?"),
    rosa %in% c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")
  )

There are 502 at-entry estuary samples with 100% complete genotypes. There are 515 Klamath at-entry estuary site samples.

Visualize the relationship between RoSA and gonadosomatic index.

rosa_stats %>%
  mutate(rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"))) %>%
  ggplot(., aes(x = julian, y = gsi, fill = rosa_f, colour = rosa_f)) +
  geom_point() +
  xlab(label = "Estuary sampling date") +
  ylab(label = "Gonadosomatic Index") +
  scale_fill_discrete(name = "Genotype") +
  scale_colour_discrete(name = "Genotype") +
  theme_bw() +
  scale_x_continuous(breaks = c(121, 152, 182, 213, 244, 274, 305), labels = c(
    "May-01", "June-01",
    "July-01", "Aug-01", "Sept-01", "Oct-01", "Nov-01"
  )) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14), axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  guides(
    fill = FALSE,
    colour = FALSE
  ) +
  facet_grid(year ~ .)

rosa_stats %>%
  mutate(rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL"))) %>%
  ggplot(., aes(x = julian, y = gsi, fill = rosa_f, colour = rosa_f)) +
  geom_point() +
  xlab(label = "Estuary sampling date") +
  ylab(label = "Gonadosomatic Index") +
  scale_fill_discrete(name = "Genotype") +
  scale_colour_discrete(name = "Genotype") +
  theme_bw() +
  scale_x_continuous(breaks = c(121, 152, 182, 213, 244, 274, 305), labels = c(
    "May-01", "June-01",
    "July-01", "Aug-01", "Sept-01", "Oct-01", "Nov-01"
  )) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14), axis.text.x = element_text(angle = 45, hjust = 1), strip.background = element_blank()
  ) +
  facet_grid(year ~ sex)

Start mixed effects model analysis to determine if adding RoSA as a random effect predicts Gonadosomatic Index better than just a model with estuary sampling date.

stat_data <- rosa_stats %>%
  mutate(
    sex_f = factor(sex, levels = c("f", "m")),
    year_f = factor(year, levels = c("2009", "2010")),
    rosa_f = factor(rosa, levels = c("EEEEEEEE", "HHHHHHHH", "LLLLLLLL")),
    log_est = log(julian),
    log_est2 = log(julian)^2,
    gonadsi = round(gsi, 4)
  )

exploratory plotting to look at variance among explanatory variables

plot(gonadsi ~ log_est, stat_data)

plot(gonadsi ~ log_est2, stat_data)

plot(gonadsi ~ sex_f, stat_data)

plot(gonadsi ~ year_f, stat_data)

It looks like all of the explanatory variables may have a violation of equal variance. I’ll use the different variance structures to deal with this and see if it’s preferred over a model without the additional variance structures.

It looks like all of the explanatory variables may have a violation of equal variance. I’ll use the different variance structures to deal with this and see if it’s preferred over a model without the additional variance structures.

M0 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, method = "REML", data = stat_data)
logestFixed <- varFixed(~log_est)
logest2Fixed <- varFixed(~log_est2)
logest2Power <- varPower(form = ~log_est2)
logest2ConstPower <- varConstPower(form = ~log_est2)
estFixed <- varFixed(~ log_est + log_est2)
sexFixed <- varIdent(~sex_f)
yearFixed <- varIdent(~year_f)
sexyrFixed <- varIdent(~ sex_f + year_f)
combFixed <- varComb(
  varFixed(~ log_est + log_est2),
  varIdent(~ sex_f + year_f)
)
M1 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logestFixed, method = "REML", data = stat_data)
M2 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logest2Fixed, method = "REML", data = stat_data)
M3 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = estFixed, method = "REML", data = stat_data)
M4 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = sexFixed, method = "REML", data = stat_data)
M5 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = yearFixed, method = "REML", data = stat_data)
M6 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = sexyrFixed, method = "REML", data = stat_data)
M7 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = combFixed, method = "REML", data = stat_data)
M8 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logest2Power, method = "REML", data = stat_data)
M9 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logest2ConstPower, method = "REML", data = stat_data)
AIC(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9)

The model that uses the varPower variance structure on the log_est2 explanatory variable is the preferred model indicated by AIC. The varConstPower is nearly the same as the varPower structure, but those two are highly preferred over the other variance structures.

Model checking of the varPower model (M8)

e1 <- resid(M8, type = "normalized")
plot(e1 ~ log_est2, stat_data)

Awesome, this looks way way better than the original gls model.

plot(e1 ~ sex_f, stat_data)

plot(e1 ~ log_est, stat_data)

plot(e1 ~ year_f, stat_data)

hist(e1, main = "Histogram of residuals") # looks ok

Moving forward with the varPower structure on log_est2 for the model selection exercise

mm1 <- lme(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, random = ~ 1 | rosa_f, method = "REML", weights = logest2Power, data = stat_data) # random intercept model
anova(M8, mm1) # likelihood ratio test to determine if the mixed effects model is preferred over the gls with varPower variance structure.

Adding a random intercept for RoSA does not improve the model. When forcing the 3 RoSA genotypes to have the same slope (same effect of estuary date on gonadosomatic index) there is no support for differing intercepts (i.e. all RoSA genotypes start with the same gonadosomatic index at day 0).

Lets see what happen when a random intercept and slope for RoSA are included. A random slope would indicate that the strength of the relationship between gonadosomatic index and estuary entry date is not the same among RoSA genotypes.

mm2 <- lme(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, random = ~ 1 + log_est | rosa_f, method = "REML", weights = logest2Power, data = stat_data)
anova(M8, mm2)

A random slope and intercept model is not preferred over the non-random model. (Note this does not differ in a different slope for log_est or log_est2 is used). I used both log_est and log_est2 (change code to “random = ~1+log_est2|RoSA”) and both models were not preferred over the gls.

Let’s see if the random slope (without random intercept) for RoSA is the most preferred model

mm3 <- lme(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, random = ~ 0 + log_est | rosa_f, method = "REML", weights = logest2Power, data = stat_data)
anova(M8, mm3)

No support for a random slope model. I used both log_est and log_est2 (change code to “random = ~0+log_est2|rosa_f”) and both models were not preferred over the gls. See code chunks below.

mm2a <- lme(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, random = ~ 1 + log_est2 | rosa_f, method = "REML", weights = logest2Power, data = stat_data)
anova(M8, mm2a)
mm3a <- lme(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, random = ~ 0 + log_est2 | rosa_f, method = "REML", weights = logest2Power, data = stat_data)
anova(M8, mm3a)

The random slope model isn’t preferred over the fixed effects model or the random slope and random intercept model.

Making sense of these results:

Julian Day is highly significant (it appears, need to do the model selection procedure–> see below)

Random intercept for RoSA not supported –> RoSA genotypes don’t start off at different gonadSI values when the slope relationship between estuary sampling date and gonadosomatic index is not allowed to vary for the three RoSA genotypes.

Random slope for RoSA not supported –> The effect size of estuary sampling date on gonadosomatic index is not different among RoSA genotypes when the intercept is constrained to be the same for all 3 genotypes.

Random slope and intercept model is not preferred over the GLS model.

random intercept model coefficients

mm1$coefficients
## $fixed
##   (Intercept)       log_est      log_est2    year_f2010          sexm 
##  1.6044820066 -0.6570168098  0.0679206845 -0.0009777401 -0.0138359664 
## 
## $random
## $random$rosa_f
##            (Intercept)
## EEEEEEEE -8.915519e-20
## HHHHHHHH  2.361010e-19
## LLLLLLLL -1.469235e-19

random slope model coefficients

mm3$coefficients
## $fixed
##   (Intercept)       log_est      log_est2    year_f2010          sexm 
##  1.6044820059 -0.6570168096  0.0679206845 -0.0009777401 -0.0138359664 
## 
## $random
## $random$rosa_f
##                log_est
## EEEEEEEE -1.566720e-20
## HHHHHHHH  4.302565e-20
## LLLLLLLL -2.735887e-20

random intercept and slope model coefficients

mm2$coefficients
## $fixed
##   (Intercept)       log_est      log_est2    year_f2010          sexm 
##  1.6044820057 -0.6570168095  0.0679206845 -0.0009777401 -0.0138359664 
## 
## $random
## $random$rosa_f
##            (Intercept)       log_est
## EEEEEEEE -4.636679e-20 -7.510387e-21
## HHHHHHHH  1.217988e-19  2.082072e-20
## LLLLLLLL -7.536735e-20 -1.332469e-20

Model selection on the fixed effects in the GLS model (M8) and see what predictors significantly influence gonadosomatic index.

M0 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logest2Power, data = stat_data)
M1 <- gls(gonadsi ~ 1 + log_est + log_est2 + sex, weights = logest2Power, data = stat_data) # model without year
M2 <- gls(gonadsi ~ 1 + log_est + log_est2 + year_f, weights = logest2Power, data = stat_data) # model without sex
M3 <- gls(gonadsi ~ 1 + log_est + year_f + sex, weights = logest2Power, data = stat_data) # model without log estuary date^2
M4 <- gls(gonadsi ~ 1 + log_est2 + year_f + sex, weights = logest2Power, data = stat_data) # model without log estuary date
fit0ml <- update(M0, . ~ ., method = "ML")
fit1ml <- update(M1, . ~ ., method = "ML")
fit2ml <- update(M2, . ~ ., method = "ML")
fit3ml <- update(M3, . ~ ., method = "ML")
fit4ml <- update(M4, . ~ ., method = "ML")

Likelihood ratio tests using full GLS model;

gonadsi ~ 1 + log_est + log_est2 + year_f + sex, weights = logest2Power, data = stat_data

LRT on year

anova(fit0ml, fit1ml) # year is not significant

LRT on sex

anova(fit0ml, fit2ml) # sex is significant

LRT on log_est^2

anova(fit0ml, fit3ml) # log estuary date squared is significant

LRT on log_est

anova(fit0ml, fit4ml) # log estuary date is significant
library(sjPlot)
tab_model(M0)
## 'r2()' does not support models of class 'gls'.
  gonadsi
Predictors Estimates CI p
(Intercept) 1.60 1.25 – 1.96 <0.001
log_est -0.66 -0.79 – -0.52 <0.001
log_est2 0.07 0.05 – 0.08 <0.001
year_f [2010] -0.00 -0.00 – 0.00 0.110
sex [m] -0.01 -0.02 – -0.01 <0.001
Observations 502

(The error message here is irrelevant to printing the table we want.)

Remove year from the GLS and redo model selection on remaining fixed effects

M1 <- gls(gonadsi ~ 1 + log_est + log_est2 + sex, weights = logest2Power, data = stat_data) # model without year
M2 <- gls(gonadsi ~ 1 + log_est + log_est2, weights = logest2Power, data = stat_data) # model without sex
M3 <- gls(gonadsi ~ 1 + log_est + sex, weights = logest2Power, data = stat_data) # model without log estuary date^2
M4 <- gls(gonadsi ~ 1 + log_est2 + sex, weights = logest2Power, data = stat_data) # model without log estuary date
fit1ml <- update(M1, . ~ ., method = "ML")
fit2ml <- update(M2, . ~ ., method = "ML")
fit3ml <- update(M3, . ~ ., method = "ML")
fit4ml <- update(M4, . ~ ., method = "ML")

Likelihood ratio tests using reduced GLS model

gonadsi ~ 1 + log_est + log_est2 + sex, weights = logest2Power, data = stat_data

LRT on sex

anova(fit1ml, fit2ml) # sex is significant

LRT on log_est2

anova(fit1ml, fit3ml) # log estuary date squared is significant

LRT on log_est

anova(fit1ml, fit4ml) # log estuary date is significant

In conclusion, sex, log_est and log_est2 significantly influence gonadosomatic index.

tab_model(M1)
## 'r2()' does not support models of class 'gls'.
  gonadsi
Predictors Estimates CI p
(Intercept) 1.65 1.31 – 2.00 <0.001
log_est -0.68 -0.81 – -0.54 <0.001
log_est2 0.07 0.06 – 0.08 <0.001
sex [m] -0.01 -0.01 – -0.01 <0.001
Observations 502

(The error message here is irrelevant to printing the table we want.)

For the effects of RoSA on gonadosomatic index, there is no evidence that a difference in intercept or slope occurs among the RoSA genotypes.

Session Info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Sierra 10.12.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Denver              
##  date     2020-05-14                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version  date       lib source        
##  assertthat     0.2.1    2019-03-21 [1] CRAN (R 3.6.0)
##  backports      1.1.6    2020-04-05 [1] CRAN (R 3.6.2)
##  bayestestR     0.6.0    2020-04-20 [1] CRAN (R 3.6.2)
##  boot           1.3-23   2019-07-05 [2] CRAN (R 3.6.2)
##  broom          0.5.6    2020-04-20 [1] CRAN (R 3.6.2)
##  cellranger     1.1.0    2016-07-27 [1] CRAN (R 3.6.0)
##  cli            2.0.2    2020-02-28 [1] CRAN (R 3.6.0)
##  coda           0.19-3   2019-07-05 [1] CRAN (R 3.6.0)
##  colorspace     1.4-1    2019-03-18 [1] CRAN (R 3.6.0)
##  crayon         1.3.4    2017-09-16 [1] CRAN (R 3.6.0)
##  DBI            1.1.0    2019-12-15 [1] CRAN (R 3.6.0)
##  dbplyr         1.4.3    2020-04-19 [1] CRAN (R 3.6.2)
##  digest         0.6.25   2020-02-23 [1] CRAN (R 3.6.0)
##  dplyr        * 0.8.5    2020-03-07 [1] CRAN (R 3.6.0)
##  effectsize     0.3.0    2020-04-11 [1] CRAN (R 3.6.2)
##  ellipsis       0.3.0    2019-09-20 [1] CRAN (R 3.6.0)
##  emmeans        1.4.6    2020-04-19 [1] CRAN (R 3.6.2)
##  estimability   1.3      2018-02-11 [1] CRAN (R 3.6.0)
##  evaluate       0.14     2019-05-28 [1] CRAN (R 3.6.0)
##  fansi          0.4.1    2020-01-08 [1] CRAN (R 3.6.0)
##  farver         2.0.3    2020-01-16 [1] CRAN (R 3.6.0)
##  forcats      * 0.5.0    2020-03-01 [1] CRAN (R 3.6.0)
##  fs             1.4.1    2020-04-04 [1] CRAN (R 3.6.2)
##  generics       0.0.2    2018-11-29 [1] CRAN (R 3.6.0)
##  ggeffects      0.14.3   2020-04-20 [1] CRAN (R 3.6.2)
##  ggplot2      * 3.3.0    2020-03-05 [1] CRAN (R 3.6.0)
##  glue           1.4.0    2020-04-03 [1] CRAN (R 3.6.2)
##  gtable         0.3.0    2019-03-25 [1] CRAN (R 3.6.0)
##  haven          2.2.0    2019-11-08 [1] CRAN (R 3.6.0)
##  hms            0.5.3    2020-01-08 [1] CRAN (R 3.6.0)
##  htmltools      0.4.0    2019-10-04 [1] CRAN (R 3.6.0)
##  httr           1.4.1    2019-08-05 [1] CRAN (R 3.6.0)
##  insight        0.8.3    2020-04-20 [1] CRAN (R 3.6.2)
##  jsonlite       1.6.1    2020-02-02 [1] CRAN (R 3.6.0)
##  knitr          1.28     2020-02-06 [1] CRAN (R 3.6.0)
##  labeling       0.3      2014-08-23 [1] CRAN (R 3.6.0)
##  lattice        0.20-38  2018-11-04 [2] CRAN (R 3.6.2)
##  lifecycle      0.2.0    2020-03-06 [1] CRAN (R 3.6.0)
##  lme4           1.1-23   2020-04-07 [1] CRAN (R 3.6.2)
##  lubridate      1.7.8    2020-04-06 [1] CRAN (R 3.6.2)
##  magrittr       1.5      2014-11-22 [1] CRAN (R 3.6.0)
##  MASS           7.3-51.6 2020-04-26 [1] CRAN (R 3.6.2)
##  Matrix         1.2-18   2019-11-27 [2] CRAN (R 3.6.2)
##  minqa          1.2.4    2014-10-09 [1] CRAN (R 3.6.0)
##  modelr         0.1.6    2020-02-22 [1] CRAN (R 3.6.0)
##  munsell        0.5.0    2018-06-12 [1] CRAN (R 3.6.0)
##  mvtnorm        1.1-0    2020-02-24 [1] CRAN (R 3.6.0)
##  nlme         * 3.1-142  2019-11-07 [2] CRAN (R 3.6.2)
##  nloptr         1.2.2.1  2020-03-11 [1] CRAN (R 3.6.0)
##  parameters     0.6.1    2020-04-08 [1] CRAN (R 3.6.2)
##  performance    0.4.5    2020-03-28 [1] CRAN (R 3.6.2)
##  pillar         1.4.3    2019-12-20 [1] CRAN (R 3.6.0)
##  pkgconfig      2.0.3    2019-09-22 [1] CRAN (R 3.6.0)
##  purrr        * 0.3.4    2020-04-17 [1] CRAN (R 3.6.2)
##  R6             2.4.1    2019-11-12 [1] CRAN (R 3.6.0)
##  Rcpp           1.0.4    2020-03-17 [1] CRAN (R 3.6.0)
##  readr        * 1.3.1    2018-12-21 [1] CRAN (R 3.6.0)
##  readxl         1.3.1    2019-03-13 [1] CRAN (R 3.6.0)
##  reprex         0.3.0    2019-05-16 [1] CRAN (R 3.6.0)
##  rlang          0.4.5    2020-03-01 [1] CRAN (R 3.6.0)
##  rmarkdown      2.1      2020-01-20 [1] CRAN (R 3.6.0)
##  rstudioapi     0.11     2020-02-07 [1] CRAN (R 3.6.0)
##  rvest          0.3.5    2019-11-08 [1] CRAN (R 3.6.0)
##  scales         1.1.0    2019-11-18 [1] CRAN (R 3.6.0)
##  sessioninfo    1.1.1    2018-11-05 [1] CRAN (R 3.6.0)
##  sjlabelled     1.1.3    2020-01-28 [1] CRAN (R 3.6.0)
##  sjmisc         2.8.4    2020-04-03 [1] CRAN (R 3.6.2)
##  sjPlot       * 2.8.3    2020-03-09 [1] CRAN (R 3.6.0)
##  sjstats        0.17.9   2020-02-06 [1] CRAN (R 3.6.0)
##  statmod        1.4.34   2020-02-17 [1] CRAN (R 3.6.0)
##  stringi        1.4.6    2020-02-17 [1] CRAN (R 3.6.0)
##  stringr      * 1.4.0    2019-02-10 [1] CRAN (R 3.6.0)
##  tibble       * 3.0.1    2020-04-20 [1] CRAN (R 3.6.2)
##  tidyr        * 1.0.2    2020-01-24 [1] CRAN (R 3.6.0)
##  tidyselect     1.0.0    2020-01-27 [1] CRAN (R 3.6.0)
##  tidyverse    * 1.3.0    2019-11-21 [1] CRAN (R 3.6.0)
##  vctrs          0.2.4    2020-03-10 [1] CRAN (R 3.6.0)
##  withr          2.2.0    2020-04-20 [1] CRAN (R 3.6.2)
##  xfun           0.13     2020-04-13 [1] CRAN (R 3.6.2)
##  xml2           1.3.2    2020-04-23 [1] CRAN (R 3.6.2)
##  xtable         1.8-4    2019-04-21 [1] CRAN (R 3.6.0)
##  yaml           2.2.1    2020-02-01 [1] CRAN (R 3.6.0)
## 
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library

Running Time

Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:

Sys.time() - start_time
## Time difference of 5.976112 secs