library(tidyverse)
library(rubias)
dir.create("outputs/101", recursive = TRUE, showWarnings = FALSE)
Baseline final has samples with less than 15% missing data and there are no duplicates.
baseline_final <- readRDS("./data/101-GSI-rubias-baseline-data.rds")
baseline_final %>%
count(collection, repunit) %>%
arrange(repunit)
Some of those collection counts are pretty dang small. I’m going to lump all Salmon River collections into a single collection. The rest of the collections look good to me.
baseline_final <- baseline_final %>%
mutate(collection = ifelse(collection %in% c("SouthForkSalmonRiver", "SalmonSp", "SalmonRiver", "SalmonR", "SalmonFall", "EastForkSouthForkSalmonRiver"), "SalmonRiver", collection))
baseline_final %>%
count(collection, repunit) %>%
arrange(repunit)
For this analysis the baseline data will have the following groups:
mixture_final <- read_rds("./data/101-GSI-rubias-mixture-data.rds")
sa_chinook <- self_assign(reference = baseline_final, gen_start_col = 5)
## Summary Statistics:
##
## 2729 Individuals in Sample
##
## 124 Loci: tag_id_1030_1, tag_id_1079_1, tag_id_1126_1, tag_id_1144_1, tag_id_115_1, tag_id_1191_1, tag_id_120_1, tag_id_1227_1, tag_id_1243_1, tag_id_1276_1, tag_id_1281_1, tag_id_1363_1, tag_id_1413_1, tag_id_1425_1, tag_id_1428_1, tag_id_1470_1, tag_id_1551_1, tag_id_1554_1, tag_id_1629_1, tag_id_1692_1, tag_id_1733_1, tag_id_186_1, tag_id_1872_1, tag_id_2_1016_1, tag_id_2_1029_1, tag_id_2_1114_1, tag_id_2_113_1, tag_id_2_1158_1, tag_id_2_123_1, tag_id_2_1268_1, tag_id_2_1348_1, tag_id_2_136_1, tag_id_2_1382_1, tag_id_2_1539_1, tag_id_2_1579_1, tag_id_2_1586_1, tag_id_2_1693_1, tag_id_2_188_1, tag_id_2_1887_1, tag_id_2_20_1, tag_id_2_206_1, tag_id_2_2222_1, tag_id_2_234_1, tag_id_2_251_1, tag_id_2_2632_1, tag_id_2_2741_1, tag_id_2_2787_1, tag_id_2_284_1, tag_id_2_2973_1, tag_id_2_3026_1, tag_id_2_3094_1, tag_id_2_311_1, tag_id_2_3182_1, tag_id_2_321_1, tag_id_2_332_1, tag_id_2_3452_1, tag_id_2_3471_1, tag_id_2_3577_1, tag_id_2_40_1, tag_id_2_414_1, tag_id_2_419_1, tag_id_2_487_1, tag_id_2_502_1, tag_id_2_58_1, tag_id_2_633_1, tag_id_2_661_1, tag_id_2_694_1, tag_id_2_700_1, tag_id_2_705_1, tag_id_2_749_1, tag_id_2_786_1, tag_id_2_855_1, tag_id_2_859_1, tag_id_2_9_1, tag_id_2_911_1, tag_id_2_935_1, tag_id_2_939_1, tag_id_2_953_1, tag_id_2_978_1, tag_id_2_98_1, tag_id_235_1, tag_id_251_1, tag_id_275_1, tag_id_278_1, tag_id_282_1, tag_id_3194_1, tag_id_32_1, tag_id_3221_1, tag_id_381_1, tag_id_384_1, tag_id_3920_1, tag_id_423_1, tag_id_425_1, tag_id_427_1, tag_id_430_1, tag_id_481_1, tag_id_491_1, tag_id_4969_1, tag_id_5385_1, tag_id_542_1, tag_id_554_1, tag_id_5617_1, tag_id_5686_1, tag_id_5720_1, tag_id_600_1, tag_id_603_1, tag_id_650_1, tag_id_664_1, tag_id_669_1, tag_id_684_1, tag_id_695_1, tag_id_70_1, tag_id_716_1, tag_id_744_1, tag_id_757_1, tag_id_773_1, tag_id_784_1, tag_id_787_1, tag_id_819_1, tag_id_826_1, tag_id_871_1, tag_id_945_1, tag_id_968_1, tag_id_999_1
##
## 4 Reporting Units: BlueCreek, CoastalCA, TrinityRiver, UpperKlamath
##
## 10 Collections: BlueFall, EelVAfl, RussianWSH, TRHsibs, TrinityFall, TrinitySp, SalmonRiver, IGH, ScottFall, ShastaFall
##
## 1.80% of allelic data identified as missing
sa_to_repu <- sa_chinook %>%
group_by(indiv, collection, repunit, inferred_repunit) %>%
summarise(repu_scaled_like = sum(scaled_likelihood))
sa_to_repu %>%
group_by(indiv) %>%
top_n(1, wt = repu_scaled_like) %>%
mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
ungroup() %>%
count(same_pop)
The baseline has a 96.3 correct assignment percentage to original reporting unit.
Which baseline samples are assigned to an “incorrect” population?
sa_to_repu %>%
group_by(indiv) %>%
top_n(1, wt = repu_scaled_like) %>%
mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
ungroup() %>%
filter(same_pop == "no") %>%
mutate(tmp = paste0(repunit, "_", inferred_repunit)) %>%
count(tmp) %>%
separate(tmp, into = c("repunit", "inferred_repunit"))
Calculate repunit self-assignment accuracy
sa_to_repu %>%
group_by(indiv) %>%
top_n(1, wt = repu_scaled_like) %>%
mutate(same_pop = ifelse(repunit == inferred_repunit, "yes", "no")) %>%
ungroup() %>%
group_by(repunit) %>%
count(same_pop) %>%
mutate(tot_fish = sum(n)) %>%
filter(same_pop == "yes") %>%
mutate(assign_rate = n / tot_fish) %>%
rename(n_same = n)
mix_est_pb <- infer_mixture(
reference = baseline_final,
mixture = mixture_final,
gen_start_col = 5,
method = "PB"
)
## Warning: Factor `repunit` contains implicit NA, consider using
## `forcats::fct_explicit_na`
Reporting mixture proportion estimates for each baseline population
rep_mix_ests <- mix_est_pb$mixing_proportions %>%
group_by(mixture_collection, repunit) %>%
summarise(repprop = round(sum(pi), 3))
rep_mix_ests %>%
arrange(desc(repprop))
Calculate individuals posteriors
rep_indiv_ests <- mix_est_pb$indiv_posteriors %>%
group_by(mixture_collection, indiv, repunit) %>%
summarise(rep_pofz = sum(PofZ))
rep_indiv_ests %>%
group_by(indiv) %>%
arrange(desc(rep_pofz)) %>%
slice(1) %>%
ungroup()
nsweeps <- max(mix_est_pb$mix_prop_traces$sweep)
trace_subset <- mix_est_pb$mix_prop_traces %>%
filter(sweep > 200) %>%
group_by(sweep, repunit) %>%
summarise(repprop = sum(pi))
ggplot(trace_subset, aes(x = repprop, colour = repunit)) +
geom_density(size = 2) +
ylim(0, 50) +
theme_bw()
Lets see if any of the fish look like non Klamath River basin fish, blue line should mirror the black line.
# get the maximum-a-posteriori population for each individual
map_rows <- mix_est_pb$indiv_posteriors %>%
group_by(indiv) %>%
top_n(1, PofZ) %>%
ungroup()
normo <- tibble(z_score = rnorm(1e06))
ggplot(map_rows, aes(x = z_score)) +
geom_density(colour = "blue") +
geom_density(data = normo, colour = "black")
Computing credible intervals
top_cis <- trace_subset %>%
group_by(repunit) %>%
summarise(
pt_est = round(mean(repprop), 3),
loCI = round(quantile(repprop, probs = 0.025), 3),
hiCI = round(quantile(repprop, probs = 0.975), 3)
)
top_cis
There is 1 fish that didn’t assign to a Klamath Basin reporting unit. Let’s see what RoSA genotype that fish has.
kl_rosa <- read_rds("./data/101-RoSA-klamath-estuary-samples.rds")
kl_rosa %>% filter(Indiv == "CH11459")
Having a homozygous late genotype in a fish assigned to the CoastalCA repunit isn’t surprising. It makes sense that the fish is homozygous late as only fall-run ecotype fish are present in the CoastalCA repunit.
Write an RDS to use in later analyses.
dir.create("./outputs")
estuary_assignments <- rep_indiv_ests %>%
group_by(indiv) %>%
arrange(desc(rep_pofz)) %>%
slice(1) %>%
ungroup()
write_rds(estuary_assignments, "./outputs/101/RoSA-klamath-estuary-rubias-assignments.rds", compress = "xz")
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## RcppParallel 5.0.0 2020-03-11 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rubias * 0.3.1.900 2020-04-01 [1] local
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 20.07908 secs