library(tidyverse)
library(rubias)

dir.create("outputs/101", recursive = TRUE, showWarnings = FALSE)

1. Load GSI baseline data frame.

Baseline final has samples with less than 15% missing data and there are no duplicates.

baseline_final <- readRDS("./data/101-GSI-rubias-baseline-data.rds")
baseline_final %>%
  count(collection, repunit) %>%
  arrange(repunit)

Some of those collection counts are pretty dang small. I’m going to lump all Salmon River collections into a single collection. The rest of the collections look good to me.

baseline_final <- baseline_final %>%
  mutate(collection = ifelse(collection %in% c("SouthForkSalmonRiver", "SalmonSp", "SalmonRiver", "SalmonR", "SalmonFall", "EastForkSouthForkSalmonRiver"), "SalmonRiver", collection))

baseline_final %>%
  count(collection, repunit) %>%
  arrange(repunit)

For this analysis the baseline data will have the following groups:

  1. Trinity River (Trinity River hatchery spring and Trinity River hatchery fall)
  2. Klamath River (Salmon River [includes Salmon River fall, Salmon River spring, Salmon River samples], Scott River fall, Shasta River fall), Iron Gate hatchery)
  3. Blue River (this is part of the Southern Oregon and Northern California Coastal ESU even though it is a lower Klamath River tributary)
  4. Coastal California (Russian and Eel rivers)

2. Load the mixture data frame of the klamath at-entry samples.

mixture_final <- read_rds("./data/101-GSI-rubias-mixture-data.rds")

3. Self-assignment evaluation of the baseline

sa_chinook <- self_assign(reference = baseline_final, gen_start_col = 5)
## Summary Statistics:
## 
## 2729 Individuals in Sample
## 
## 124 Loci: tag_id_1030_1, tag_id_1079_1, tag_id_1126_1, tag_id_1144_1, tag_id_115_1, tag_id_1191_1, tag_id_120_1, tag_id_1227_1, tag_id_1243_1, tag_id_1276_1, tag_id_1281_1, tag_id_1363_1, tag_id_1413_1, tag_id_1425_1, tag_id_1428_1, tag_id_1470_1, tag_id_1551_1, tag_id_1554_1, tag_id_1629_1, tag_id_1692_1, tag_id_1733_1, tag_id_186_1, tag_id_1872_1, tag_id_2_1016_1, tag_id_2_1029_1, tag_id_2_1114_1, tag_id_2_113_1, tag_id_2_1158_1, tag_id_2_123_1, tag_id_2_1268_1, tag_id_2_1348_1, tag_id_2_136_1, tag_id_2_1382_1, tag_id_2_1539_1, tag_id_2_1579_1, tag_id_2_1586_1, tag_id_2_1693_1, tag_id_2_188_1, tag_id_2_1887_1, tag_id_2_20_1, tag_id_2_206_1, tag_id_2_2222_1, tag_id_2_234_1, tag_id_2_251_1, tag_id_2_2632_1, tag_id_2_2741_1, tag_id_2_2787_1, tag_id_2_284_1, tag_id_2_2973_1, tag_id_2_3026_1, tag_id_2_3094_1, tag_id_2_311_1, tag_id_2_3182_1, tag_id_2_321_1, tag_id_2_332_1, tag_id_2_3452_1, tag_id_2_3471_1, tag_id_2_3577_1, tag_id_2_40_1, tag_id_2_414_1, tag_id_2_419_1, tag_id_2_487_1, tag_id_2_502_1, tag_id_2_58_1, tag_id_2_633_1, tag_id_2_661_1, tag_id_2_694_1, tag_id_2_700_1, tag_id_2_705_1, tag_id_2_749_1, tag_id_2_786_1, tag_id_2_855_1, tag_id_2_859_1, tag_id_2_9_1, tag_id_2_911_1, tag_id_2_935_1, tag_id_2_939_1, tag_id_2_953_1, tag_id_2_978_1, tag_id_2_98_1, tag_id_235_1, tag_id_251_1, tag_id_275_1, tag_id_278_1, tag_id_282_1, tag_id_3194_1, tag_id_32_1, tag_id_3221_1, tag_id_381_1, tag_id_384_1, tag_id_3920_1, tag_id_423_1, tag_id_425_1, tag_id_427_1, tag_id_430_1, tag_id_481_1, tag_id_491_1, tag_id_4969_1, tag_id_5385_1, tag_id_542_1, tag_id_554_1, tag_id_5617_1, tag_id_5686_1, tag_id_5720_1, tag_id_600_1, tag_id_603_1, tag_id_650_1, tag_id_664_1, tag_id_669_1, tag_id_684_1, tag_id_695_1, tag_id_70_1, tag_id_716_1, tag_id_744_1, tag_id_757_1, tag_id_773_1, tag_id_784_1, tag_id_787_1, tag_id_819_1, tag_id_826_1, tag_id_871_1, tag_id_945_1, tag_id_968_1, tag_id_999_1
## 
## 4 Reporting Units: BlueCreek, CoastalCA, TrinityRiver, UpperKlamath
## 
## 10 Collections: BlueFall, EelVAfl, RussianWSH, TRHsibs, TrinityFall, TrinitySp, SalmonRiver, IGH, ScottFall, ShastaFall
## 
## 1.80% of allelic data identified as missing
sa_to_repu <- sa_chinook %>%
  group_by(indiv, collection, repunit, inferred_repunit) %>%
  summarise(repu_scaled_like = sum(scaled_likelihood))

sa_to_repu %>%
  group_by(indiv) %>%
  top_n(1, wt = repu_scaled_like) %>%
  mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
  ungroup() %>%
  count(same_pop)

The baseline has a 96.3 correct assignment percentage to original reporting unit.

Which baseline samples are assigned to an “incorrect” population?

sa_to_repu %>%
  group_by(indiv) %>%
  top_n(1, wt = repu_scaled_like) %>%
  mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
  ungroup() %>%
  filter(same_pop == "no") %>%
  mutate(tmp = paste0(repunit, "_", inferred_repunit)) %>%
  count(tmp) %>%
  separate(tmp, into = c("repunit", "inferred_repunit"))

Calculate repunit self-assignment accuracy

sa_to_repu %>%
  group_by(indiv) %>%
  top_n(1, wt = repu_scaled_like) %>%
  mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
  ungroup() %>%
  group_by(repunit) %>%
  count(same_pop) %>%
  mutate(tot_fish = sum(n)) %>%
  filter(same_pop == "yes") %>%
  mutate(assign_rate = n / tot_fish) %>%
  rename(n_same = n)
  1. Perform a mixture analysis using the boostrap correction reporting unit proportions RUBIAS settings from https://github.com/eriqande/rubias/blob/master/vignettes/rubias-overview.Rmd
mix_est_pb <- infer_mixture(
  reference = baseline_final,
  mixture = mixture_final,
  gen_start_col = 5,
  method = "PB"
)
## Warning: Factor `repunit` contains implicit NA, consider using
## `forcats::fct_explicit_na`

Reporting mixture proportion estimates for each baseline population

rep_mix_ests <- mix_est_pb$mixing_proportions %>%
  group_by(mixture_collection, repunit) %>%
  summarise(repprop = round(sum(pi), 3))

rep_mix_ests %>%
  arrange(desc(repprop))

Calculate individuals posteriors

rep_indiv_ests <- mix_est_pb$indiv_posteriors %>%
  group_by(mixture_collection, indiv, repunit) %>%
  summarise(rep_pofz = sum(PofZ))

rep_indiv_ests %>%
  group_by(indiv) %>%
  arrange(desc(rep_pofz)) %>%
  slice(1) %>%
  ungroup()

plot the mixture estimates from each group.

nsweeps <- max(mix_est_pb$mix_prop_traces$sweep)

trace_subset <- mix_est_pb$mix_prop_traces %>%
  filter(sweep > 200) %>%
  group_by(sweep, repunit) %>%
  summarise(repprop = sum(pi))

ggplot(trace_subset, aes(x = repprop, colour = repunit)) +
  geom_density(size = 2) +
  ylim(0, 50) +
  theme_bw()

Lets see if any of the fish look like non Klamath River basin fish, blue line should mirror the black line.

# get the maximum-a-posteriori population for each individual
map_rows <- mix_est_pb$indiv_posteriors %>%
  group_by(indiv) %>%
  top_n(1, PofZ) %>%
  ungroup()

normo <- tibble(z_score = rnorm(1e06))

ggplot(map_rows, aes(x = z_score)) +
  geom_density(colour = "blue") +
  geom_density(data = normo, colour = "black")

Computing credible intervals

top_cis <- trace_subset %>%
  group_by(repunit) %>%
  summarise(
    pt_est = round(mean(repprop), 3),
    loCI = round(quantile(repprop, probs = 0.025), 3),
    hiCI = round(quantile(repprop, probs = 0.975), 3)
  )
top_cis

There is 1 fish that didn’t assign to a Klamath Basin reporting unit. Let’s see what RoSA genotype that fish has.

kl_rosa <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
kl_rosa %>% filter(Indiv == "CH11459")

Having a homozygous late genotype in a fish assigned to the CoastalCA repunit isn’t surprising. It makes sense that the fish is homozygous late as only fall-run ecotype fish are present in the CoastalCA repunit.

Write an RDS to use in later analyses.

dir.create("./outputs")

estuary_assignments <- rep_indiv_ests %>%
  group_by(indiv) %>%
  arrange(desc(rep_pofz)) %>%
  slice(1) %>%
  ungroup()

write_rds(estuary_assignments, "./outputs/101/RoSA-klamath-estuary-rubias-assignments.rds", compress = "xz")

Session Info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Sierra 10.12.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Denver              
##  date     2020-05-14                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version   date       lib source        
##  assertthat     0.2.1     2019-03-21 [1] CRAN (R 3.6.0)
##  backports      1.1.6     2020-04-05 [1] CRAN (R 3.6.2)
##  broom          0.5.6     2020-04-20 [1] CRAN (R 3.6.2)
##  cellranger     1.1.0     2016-07-27 [1] CRAN (R 3.6.0)
##  cli            2.0.2     2020-02-28 [1] CRAN (R 3.6.0)
##  colorspace     1.4-1     2019-03-18 [1] CRAN (R 3.6.0)
##  crayon         1.3.4     2017-09-16 [1] CRAN (R 3.6.0)
##  DBI            1.1.0     2019-12-15 [1] CRAN (R 3.6.0)
##  dbplyr         1.4.3     2020-04-19 [1] CRAN (R 3.6.2)
##  digest         0.6.25    2020-02-23 [1] CRAN (R 3.6.0)
##  dplyr        * 0.8.5     2020-03-07 [1] CRAN (R 3.6.0)
##  ellipsis       0.3.0     2019-09-20 [1] CRAN (R 3.6.0)
##  evaluate       0.14      2019-05-28 [1] CRAN (R 3.6.0)
##  fansi          0.4.1     2020-01-08 [1] CRAN (R 3.6.0)
##  farver         2.0.3     2020-01-16 [1] CRAN (R 3.6.0)
##  forcats      * 0.5.0     2020-03-01 [1] CRAN (R 3.6.0)
##  fs             1.4.1     2020-04-04 [1] CRAN (R 3.6.2)
##  generics       0.0.2     2018-11-29 [1] CRAN (R 3.6.0)
##  ggplot2      * 3.3.0     2020-03-05 [1] CRAN (R 3.6.0)
##  glue           1.4.0     2020-04-03 [1] CRAN (R 3.6.2)
##  gtable         0.3.0     2019-03-25 [1] CRAN (R 3.6.0)
##  haven          2.2.0     2019-11-08 [1] CRAN (R 3.6.0)
##  hms            0.5.3     2020-01-08 [1] CRAN (R 3.6.0)
##  htmltools      0.4.0     2019-10-04 [1] CRAN (R 3.6.0)
##  httr           1.4.1     2019-08-05 [1] CRAN (R 3.6.0)
##  jsonlite       1.6.1     2020-02-02 [1] CRAN (R 3.6.0)
##  knitr          1.28      2020-02-06 [1] CRAN (R 3.6.0)
##  labeling       0.3       2014-08-23 [1] CRAN (R 3.6.0)
##  lattice        0.20-38   2018-11-04 [2] CRAN (R 3.6.2)
##  lifecycle      0.2.0     2020-03-06 [1] CRAN (R 3.6.0)
##  lubridate      1.7.8     2020-04-06 [1] CRAN (R 3.6.2)
##  magrittr       1.5       2014-11-22 [1] CRAN (R 3.6.0)
##  modelr         0.1.6     2020-02-22 [1] CRAN (R 3.6.0)
##  munsell        0.5.0     2018-06-12 [1] CRAN (R 3.6.0)
##  nlme           3.1-142   2019-11-07 [2] CRAN (R 3.6.2)
##  pillar         1.4.3     2019-12-20 [1] CRAN (R 3.6.0)
##  pkgconfig      2.0.3     2019-09-22 [1] CRAN (R 3.6.0)
##  purrr        * 0.3.4     2020-04-17 [1] CRAN (R 3.6.2)
##  R6             2.4.1     2019-11-12 [1] CRAN (R 3.6.0)
##  Rcpp           1.0.4     2020-03-17 [1] CRAN (R 3.6.0)
##  RcppParallel   5.0.0     2020-03-11 [1] CRAN (R 3.6.0)
##  readr        * 1.3.1     2018-12-21 [1] CRAN (R 3.6.0)
##  readxl         1.3.1     2019-03-13 [1] CRAN (R 3.6.0)
##  reprex         0.3.0     2019-05-16 [1] CRAN (R 3.6.0)
##  rlang          0.4.5     2020-03-01 [1] CRAN (R 3.6.0)
##  rmarkdown      2.1       2020-01-20 [1] CRAN (R 3.6.0)
##  rstudioapi     0.11      2020-02-07 [1] CRAN (R 3.6.0)
##  rubias       * 0.3.1.900 2020-04-01 [1] local         
##  rvest          0.3.5     2019-11-08 [1] CRAN (R 3.6.0)
##  scales         1.1.0     2019-11-18 [1] CRAN (R 3.6.0)
##  sessioninfo    1.1.1     2018-11-05 [1] CRAN (R 3.6.0)
##  stringi        1.4.6     2020-02-17 [1] CRAN (R 3.6.0)
##  stringr      * 1.4.0     2019-02-10 [1] CRAN (R 3.6.0)
##  tibble       * 3.0.1     2020-04-20 [1] CRAN (R 3.6.2)
##  tidyr        * 1.0.2     2020-01-24 [1] CRAN (R 3.6.0)
##  tidyselect     1.0.0     2020-01-27 [1] CRAN (R 3.6.0)
##  tidyverse    * 1.3.0     2019-11-21 [1] CRAN (R 3.6.0)
##  vctrs          0.2.4     2020-03-10 [1] CRAN (R 3.6.0)
##  withr          2.2.0     2020-04-20 [1] CRAN (R 3.6.2)
##  xfun           0.13      2020-04-13 [1] CRAN (R 3.6.2)
##  xml2           1.3.2     2020-04-23 [1] CRAN (R 3.6.2)
##  yaml           2.2.1     2020-02-01 [1] CRAN (R 3.6.0)
## 
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library

Running Time

Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:

Sys.time() - start_time
## Time difference of 20.07908 secs