library(tidyverse)
library(ecaRbioinf)
library(ape)
library(ggtree)
## Warning: package 'ggtree' was built under R version 3.6.3
library(cowplot)
dir.create("outputs/204", recursive = TRUE, showWarnings = FALSE)
Get the data that we will be working with
haps_bi_rosa <- read_rds("outputs/009/haps_bi_rosa.rds")
sqm2_jc <- read_rds("outputs/009/sqm2_jc.rds")
source("R/define_fcolors_all_sf.R")
final_bits <- function(g, size = 2, stroke = 0.2) {
g +
geom_tippoint(aes(fill = Ecotype, shape = `Fish Genotype`, colour = Basin), size = size, stroke = stroke) +
theme(legend.position = "right") +
scale_shape_manual(values = c(25, 21, 22)) +
scale_fill_manual(values = fcolors_all_sf) +
scale_colour_manual(values = fcolors_all_sf) +
guides(
fill = guide_legend(override.aes = list(shape = 21, stroke = 0.1)),
colour = guide_legend(override.aes = list(shape = 21, stroke = 1.1, fill = NA)),
shape = guide_legend(title = "Fish Genotype")
)
}
Now, write a function that processes everything and produces a neighbor-joining tree
make_tree_from_bi_snps <- function(
haps_bi) {
# make a FASTA of the sequences (inserting the variants into them)
tmpfasta <- str_c(tempfile(), ".fa")
dump <- ecaRbioinf::insert_variants_into_sequences(
V = haps_bi,
fasta = "intermediates/009/NC_037124.1_fragment_chinook.fna.gz",
lo = 12260034,
hi = 12289484,
file = tmpfasta
)
# read in the fastas, compute distances, and make a treeio neighbor-joining tree of it
seqs <- ape::read.FASTA(tmpfasta)
seqdist <- ape::dist.dna(seqs)
nj <- ape::nj(seqdist) %>%
treeio::as.treedata()
tree <- treeio::full_join(nj, sqm2_jc, by = "label")
tree
}
tree <- make_tree_from_bi_snps(haps_bi_rosa)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
final_bits(ggtree(tree))
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
Get the error rates we will use:
rates <- read_rds("outputs/204/est_rates_for_simulation.rds")
rates
Now we need a function to create those types of changes to the genotypes in the phased haplotypes at those rates. The genotypes are coded via DNA bases, not as 0, 1, 2, but we have the REF and ALT in each case, so we can figure it out.
# first, get those rates into a list
Ra <- list()
Ra[["0"]] <- arrange(rates, simulated_new_genotype) %>%
filter(imputed_genotype == 0) %>%
pull(rate)
Ra[["1"]] <- arrange(rates, simulated_new_genotype) %>%
filter(imputed_genotype == 1) %>%
pull(rate)
Ra[["2"]] <- arrange(rates, simulated_new_genotype) %>%
filter(imputed_genotype == 2) %>%
pull(rate)
sprinkle_changes <- function(D = haps_bi_rosa, rates = rates, scramble_phase = FALSE) {
tmp <- D %>%
select(-anc_vs_derived, -freq, -spring_allele, -freq, -alle2) %>%
pivot_wider(names_from = c(haplo), values_from = c(allele, haplo_name)) %>%
mutate(g012 = case_when(
allele_a != allele_b ~ 1L,
allele_a == allele_b & allele_a == REF ~ 0L,
allele_a == allele_b & allele_a == ALT ~ 2L,
TRUE ~ 999L
)) %>%
mutate(new012 = case_when(
g012 == 0 ~ sample(x = 0:2, size = n(), prob = Ra[["0"]], replace = TRUE),
g012 == 1 ~ sample(x = 0:2, size = n(), prob = Ra[["1"]], replace = TRUE),
g012 == 2 ~ sample(x = 0:2, size = n(), prob = Ra[["2"]], replace = TRUE),
TRUE ~ 999L
))
#' check that is correct to this point. (It looks correct.)
check <- tmp %>%
count(g012, new012) %>%
group_by(g012) %>%
mutate(freq = n / sum(n))
# now, we just need to translate those back to SNP bases in haplotypes.
# Note that newly-minted heterozygotes will be randomized with
# respect to phase
tmp2 <- tmp %>%
mutate(
orig_hap = str_c(allele_a, "|", allele_b),
ra_het = str_c(REF, "|", ALT),
ar_het = str_c(ALT, "|", REF),
scrambled_het = ifelse(runif(n()) < 0.5, ra_het, ar_het),
rr_hom = str_c(REF, "|", REF),
aa_hom = str_c(ALT, "|", ALT)
) %>%
select(-ra_het, -ar_het) %>%
mutate(new_hap = case_when(
g012 == new012 ~ orig_hap,
new012 == 0 ~ rr_hom,
new012 == 1 ~ scrambled_het,
new012 == 2 ~ aa_hom
))
# here we scramble the phase of ALL heterozygous sites, if requested
if (scramble_phase == TRUE) {
tmp2 <- tmp2 %>%
mutate(new_hap = ifelse(new012 == 1, scrambled_het, new_hap))
}
# now, we pivot_longer our data frame back to something we can use to make the trees
tmp2 %>%
select(-(g012:aa_hom)) %>%
separate(new_hap, into = c("new_a", "new_b"), sep = "\\|") %>%
mutate(
allele_a = new_a,
allele_b = new_b
) %>%
select(-new_a, -new_b, -haplo_name_a, -haplo_name_b) %>%
pivot_longer(
cols = allele_a:allele_b,
names_to = "haplo",
values_to = "allele"
) %>%
mutate(haplo = str_replace(haplo, "allele_", "")) %>%
mutate(haplo_name = str_c(Indiv, "-", haplo))
}
new_haps <- sprinkle_changes()
new_tree <- make_tree_from_bi_snps(new_haps)
final_bits(ggtree(new_tree))
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
Note that looks a lot like the original tree, with just some extra length on the final twigs.
Make the plots.
phase_retained_trees <- list(
OriginalTree = make_tree_from_bi_snps(haps_bi_rosa) %>%
ggtree() %>%
final_bits()
)
set.seed(5)
for (i in 2:6) {
Tag <- str_c("Simulated_Error_Rep_", i - 1)
phase_retained_trees[[Tag]] <- sprinkle_changes() %>%
make_tree_from_bi_snps() %>%
ggtree() %>%
final_bits()
}
Then put them in a grid:
tree_grid <- plot_grid(
plotlist = phase_retained_trees,
labels = names(phase_retained_trees),
ncol = 2
)
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
# print that to an output PDF that is good sized
ggsave(tree_grid,
filename = "outputs/204/phase-remained-tree-grid.pdf",
width = 14,
height = 20
)
# plot it smooshed up too:
tree_grid
This is a little odd because most of the genotypes are homozygous except for the individuals that are hets for E and L lineage haplotypes. So, those hets are going to be pretty wonky and that might screw things up, but let us see:
phase_scrambled_trees <- list(
OriginalTree = make_tree_from_bi_snps(haps_bi_rosa) %>%
ggtree() %>%
final_bits()
)
set.seed(5)
for (i in 2:6) {
Tag <- str_c("Simulated_Error/PhaseScramble_Rep_", i - 1)
phase_scrambled_trees[[Tag]] <- sprinkle_changes(scramble_phase = TRUE) %>%
make_tree_from_bi_snps() %>%
ggtree() %>%
final_bits()
}
Then put them in a grid:
scramble_grid <- plot_grid(
plotlist = phase_scrambled_trees,
labels = names(phase_scrambled_trees),
ncol = 2
)
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
# print that to an output PDF that is good sized
ggsave(scramble_grid,
filename = "outputs/204/phase-scrambled-tree-grid.pdf",
width = 14,
height = 20
)
# plot it smooshed up too:
scramble_grid
As predicted, that merely creates a smeary and spurious cluster of individuals that are heterozygous for the E and L lineage haplotypes. But, if you exclude them from you consideration, the trees are largely concordant with the original tree.
So, the conclusion there is that our original result is quite robust to phasing uncertainty.
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## ape * 5.3 2019-03-17 [1] CRAN (R 3.6.0)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## BiocManager 1.30.10 2019-11-16 [1] CRAN (R 3.6.0)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## cowplot * 1.0.0 2019-07-11 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## ecaRbioinf * 0.1.0 2020-02-13 [1] local
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## ggtree * 2.0.4 2020-04-13 [1] Bioconductor
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 3.6.0)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvcheck 0.1.8 2020-03-01 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidytree 0.3.3 2020-04-02 [1] CRAN (R 3.6.2)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## treeio 1.10.0 2019-10-29 [1] Bioconductor
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 6.887966 mins