library(tidyverse)
library(ecaRbioinf)
library(ape)
library(ggtree)
## Warning: package 'ggtree' was built under R version 3.6.3
library(cowplot)

dir.create("outputs/204", recursive = TRUE, showWarnings = FALSE)

First, prepare functions for rapidly plotting trees

Get the data that we will be working with

haps_bi_rosa <- read_rds("outputs/009/haps_bi_rosa.rds")
sqm2_jc <- read_rds("outputs/009/sqm2_jc.rds")

A function for formatting plotted trees

source("R/define_fcolors_all_sf.R")

final_bits <- function(g, size = 2, stroke = 0.2) {
  g +
    geom_tippoint(aes(fill = Ecotype, shape = `Fish Genotype`, colour = Basin), size = size, stroke = stroke) +
    theme(legend.position = "right") +
    scale_shape_manual(values = c(25, 21, 22)) +
    scale_fill_manual(values = fcolors_all_sf) +
    scale_colour_manual(values = fcolors_all_sf) +
    guides(
      fill = guide_legend(override.aes = list(shape = 21, stroke = 0.1)),
      colour = guide_legend(override.aes = list(shape = 21, stroke = 1.1, fill = NA)),
      shape = guide_legend(title = "Fish Genotype")
    )
}

A function for making trees

Now, write a function that processes everything and produces a neighbor-joining tree

make_tree_from_bi_snps <- function(
                                   haps_bi) {

  # make a FASTA of the sequences (inserting the variants into them)
  tmpfasta <- str_c(tempfile(), ".fa")
  dump <- ecaRbioinf::insert_variants_into_sequences(
    V = haps_bi,
    fasta = "intermediates/009/NC_037124.1_fragment_chinook.fna.gz",
    lo = 12260034,
    hi = 12289484,
    file = tmpfasta
  )


  # read in the fastas, compute distances, and make a treeio neighbor-joining tree of it
  seqs <- ape::read.FASTA(tmpfasta)
  seqdist <- ape::dist.dna(seqs)
  nj <- ape::nj(seqdist) %>%
    treeio::as.treedata()
  tree <- treeio::full_join(nj, sqm2_jc, by = "label")
  tree
}

Watch it in action

tree <- make_tree_from_bi_snps(haps_bi_rosa)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
final_bits(ggtree(tree))
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

Now, a function for sprinkling errors amongst the haplotypes

Get the error rates we will use:

rates <- read_rds("outputs/204/est_rates_for_simulation.rds")
rates

Now we need a function to create those types of changes to the genotypes in the phased haplotypes at those rates. The genotypes are coded via DNA bases, not as 0, 1, 2, but we have the REF and ALT in each case, so we can figure it out.

# first, get those rates into a list
Ra <- list()
Ra[["0"]] <- arrange(rates, simulated_new_genotype) %>%
  filter(imputed_genotype == 0) %>%
  pull(rate)
Ra[["1"]] <- arrange(rates, simulated_new_genotype) %>%
  filter(imputed_genotype == 1) %>%
  pull(rate)
Ra[["2"]] <- arrange(rates, simulated_new_genotype) %>%
  filter(imputed_genotype == 2) %>%
  pull(rate)

sprinkle_changes <- function(D = haps_bi_rosa, rates = rates, scramble_phase = FALSE) {
  tmp <- D %>%
    select(-anc_vs_derived, -freq, -spring_allele, -freq, -alle2) %>%
    pivot_wider(names_from = c(haplo), values_from = c(allele, haplo_name)) %>%
    mutate(g012 = case_when(
      allele_a != allele_b ~ 1L,
      allele_a == allele_b & allele_a == REF ~ 0L,
      allele_a == allele_b & allele_a == ALT ~ 2L,
      TRUE ~ 999L
    )) %>%
    mutate(new012 = case_when(
      g012 == 0 ~ sample(x = 0:2, size = n(), prob = Ra[["0"]], replace = TRUE),
      g012 == 1 ~ sample(x = 0:2, size = n(), prob = Ra[["1"]], replace = TRUE),
      g012 == 2 ~ sample(x = 0:2, size = n(), prob = Ra[["2"]], replace = TRUE),
      TRUE ~ 999L
    ))

  #' check that is correct to this point.  (It looks correct.)
  check <- tmp %>%
    count(g012, new012) %>%
    group_by(g012) %>%
    mutate(freq = n / sum(n))

  # now, we just need to translate those back to SNP bases in haplotypes.
  # Note that newly-minted heterozygotes will be randomized with
  # respect to phase

  tmp2 <- tmp %>%
    mutate(
      orig_hap = str_c(allele_a, "|", allele_b),
      ra_het = str_c(REF, "|", ALT),
      ar_het = str_c(ALT, "|", REF),
      scrambled_het = ifelse(runif(n()) < 0.5, ra_het, ar_het),
      rr_hom = str_c(REF, "|", REF),
      aa_hom = str_c(ALT, "|", ALT)
    ) %>%
    select(-ra_het, -ar_het) %>%
    mutate(new_hap = case_when(
      g012 == new012 ~ orig_hap,
      new012 == 0 ~ rr_hom,
      new012 == 1 ~ scrambled_het,
      new012 == 2 ~ aa_hom
    ))

  # here we scramble the phase of ALL heterozygous sites, if requested
  if (scramble_phase == TRUE) {
    tmp2 <- tmp2 %>%
      mutate(new_hap = ifelse(new012 == 1, scrambled_het, new_hap))
  }


  # now, we pivot_longer our data frame back to something we can use to make the trees
  tmp2 %>%
    select(-(g012:aa_hom)) %>%
    separate(new_hap, into = c("new_a", "new_b"), sep = "\\|") %>%
    mutate(
      allele_a = new_a,
      allele_b = new_b
    ) %>%
    select(-new_a, -new_b, -haplo_name_a, -haplo_name_b) %>%
    pivot_longer(
      cols = allele_a:allele_b,
      names_to = "haplo",
      values_to = "allele"
    ) %>%
    mutate(haplo = str_replace(haplo, "allele_", "")) %>%
    mutate(haplo_name = str_c(Indiv, "-", haplo))
}

Here is a demo of its use:

new_haps <- sprinkle_changes()

new_tree <- make_tree_from_bi_snps(new_haps)

final_bits(ggtree(new_tree))
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

Note that looks a lot like the original tree, with just some extra length on the final twigs.

Make six trees: original + five with simulated error

Make the plots.

phase_retained_trees <- list(
  OriginalTree = make_tree_from_bi_snps(haps_bi_rosa) %>%
    ggtree() %>%
    final_bits()
)
set.seed(5)
for (i in 2:6) {
  Tag <- str_c("Simulated_Error_Rep_", i - 1)
  phase_retained_trees[[Tag]] <- sprinkle_changes() %>%
    make_tree_from_bi_snps() %>%
    ggtree() %>%
    final_bits()
}

Then put them in a grid:

tree_grid <- plot_grid(
  plotlist = phase_retained_trees,
  labels = names(phase_retained_trees),
  ncol = 2
)
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
# print that to an output PDF that is good sized
ggsave(tree_grid,
  filename = "outputs/204/phase-remained-tree-grid.pdf",
  width = 14,
  height = 20
)

# plot it smooshed up too:
tree_grid

Now, make 6 trees with random phasing of heterozygotes

This is a little odd because most of the genotypes are homozygous except for the individuals that are hets for E and L lineage haplotypes. So, those hets are going to be pretty wonky and that might screw things up, but let us see:

phase_scrambled_trees <- list(
  OriginalTree = make_tree_from_bi_snps(haps_bi_rosa) %>%
    ggtree() %>%
    final_bits()
)
set.seed(5)
for (i in 2:6) {
  Tag <- str_c("Simulated_Error/PhaseScramble_Rep_", i - 1)
  phase_scrambled_trees[[Tag]] <- sprinkle_changes(scramble_phase = TRUE) %>%
    make_tree_from_bi_snps() %>%
    ggtree() %>%
    final_bits()
}

Then put them in a grid:

scramble_grid <- plot_grid(
  plotlist = phase_scrambled_trees,
  labels = names(phase_scrambled_trees),
  ncol = 2
)
## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).

## Warning: Removed 1 rows containing non-finite values (stat_tree_label).
# print that to an output PDF that is good sized
ggsave(scramble_grid,
  filename = "outputs/204/phase-scrambled-tree-grid.pdf",
  width = 14,
  height = 20
)

# plot it smooshed up too:
scramble_grid

As predicted, that merely creates a smeary and spurious cluster of individuals that are heterozygous for the E and L lineage haplotypes. But, if you exclude them from you consideration, the trees are largely concordant with the original tree.

So, the conclusion there is that our original result is quite robust to phasing uncertainty.

Session Info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Sierra 10.12.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Denver              
##  date     2020-05-14                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version date       lib source        
##  ape         * 5.3     2019-03-17 [1] CRAN (R 3.6.0)
##  assertthat    0.2.1   2019-03-21 [1] CRAN (R 3.6.0)
##  backports     1.1.6   2020-04-05 [1] CRAN (R 3.6.2)
##  BiocManager   1.30.10 2019-11-16 [1] CRAN (R 3.6.0)
##  broom         0.5.6   2020-04-20 [1] CRAN (R 3.6.2)
##  cellranger    1.1.0   2016-07-27 [1] CRAN (R 3.6.0)
##  cli           2.0.2   2020-02-28 [1] CRAN (R 3.6.0)
##  colorspace    1.4-1   2019-03-18 [1] CRAN (R 3.6.0)
##  cowplot     * 1.0.0   2019-07-11 [1] CRAN (R 3.6.0)
##  crayon        1.3.4   2017-09-16 [1] CRAN (R 3.6.0)
##  DBI           1.1.0   2019-12-15 [1] CRAN (R 3.6.0)
##  dbplyr        1.4.3   2020-04-19 [1] CRAN (R 3.6.2)
##  digest        0.6.25  2020-02-23 [1] CRAN (R 3.6.0)
##  dplyr       * 0.8.5   2020-03-07 [1] CRAN (R 3.6.0)
##  ecaRbioinf  * 0.1.0   2020-02-13 [1] local         
##  ellipsis      0.3.0   2019-09-20 [1] CRAN (R 3.6.0)
##  evaluate      0.14    2019-05-28 [1] CRAN (R 3.6.0)
##  fansi         0.4.1   2020-01-08 [1] CRAN (R 3.6.0)
##  farver        2.0.3   2020-01-16 [1] CRAN (R 3.6.0)
##  forcats     * 0.5.0   2020-03-01 [1] CRAN (R 3.6.0)
##  fs            1.4.1   2020-04-04 [1] CRAN (R 3.6.2)
##  generics      0.0.2   2018-11-29 [1] CRAN (R 3.6.0)
##  ggplot2     * 3.3.0   2020-03-05 [1] CRAN (R 3.6.0)
##  ggtree      * 2.0.4   2020-04-13 [1] Bioconductor  
##  glue          1.4.0   2020-04-03 [1] CRAN (R 3.6.2)
##  gtable        0.3.0   2019-03-25 [1] CRAN (R 3.6.0)
##  haven         2.2.0   2019-11-08 [1] CRAN (R 3.6.0)
##  hms           0.5.3   2020-01-08 [1] CRAN (R 3.6.0)
##  htmltools     0.4.0   2019-10-04 [1] CRAN (R 3.6.0)
##  httr          1.4.1   2019-08-05 [1] CRAN (R 3.6.0)
##  jsonlite      1.6.1   2020-02-02 [1] CRAN (R 3.6.0)
##  knitr         1.28    2020-02-06 [1] CRAN (R 3.6.0)
##  labeling      0.3     2014-08-23 [1] CRAN (R 3.6.0)
##  lattice       0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
##  lazyeval      0.2.2   2019-03-15 [1] CRAN (R 3.6.0)
##  lifecycle     0.2.0   2020-03-06 [1] CRAN (R 3.6.0)
##  lubridate     1.7.8   2020-04-06 [1] CRAN (R 3.6.2)
##  magrittr      1.5     2014-11-22 [1] CRAN (R 3.6.0)
##  modelr        0.1.6   2020-02-22 [1] CRAN (R 3.6.0)
##  munsell       0.5.0   2018-06-12 [1] CRAN (R 3.6.0)
##  nlme          3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
##  pillar        1.4.3   2019-12-20 [1] CRAN (R 3.6.0)
##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 3.6.0)
##  purrr       * 0.3.4   2020-04-17 [1] CRAN (R 3.6.2)
##  R6            2.4.1   2019-11-12 [1] CRAN (R 3.6.0)
##  Rcpp          1.0.4   2020-03-17 [1] CRAN (R 3.6.0)
##  readr       * 1.3.1   2018-12-21 [1] CRAN (R 3.6.0)
##  readxl        1.3.1   2019-03-13 [1] CRAN (R 3.6.0)
##  reprex        0.3.0   2019-05-16 [1] CRAN (R 3.6.0)
##  rlang         0.4.5   2020-03-01 [1] CRAN (R 3.6.0)
##  rmarkdown     2.1     2020-01-20 [1] CRAN (R 3.6.0)
##  rstudioapi    0.11    2020-02-07 [1] CRAN (R 3.6.0)
##  rvcheck       0.1.8   2020-03-01 [1] CRAN (R 3.6.0)
##  rvest         0.3.5   2019-11-08 [1] CRAN (R 3.6.0)
##  scales        1.1.0   2019-11-18 [1] CRAN (R 3.6.0)
##  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 3.6.0)
##  stringi       1.4.6   2020-02-17 [1] CRAN (R 3.6.0)
##  stringr     * 1.4.0   2019-02-10 [1] CRAN (R 3.6.0)
##  tibble      * 3.0.1   2020-04-20 [1] CRAN (R 3.6.2)
##  tidyr       * 1.0.2   2020-01-24 [1] CRAN (R 3.6.0)
##  tidyselect    1.0.0   2020-01-27 [1] CRAN (R 3.6.0)
##  tidytree      0.3.3   2020-04-02 [1] CRAN (R 3.6.2)
##  tidyverse   * 1.3.0   2019-11-21 [1] CRAN (R 3.6.0)
##  treeio        1.10.0  2019-10-29 [1] Bioconductor  
##  vctrs         0.2.4   2020-03-10 [1] CRAN (R 3.6.0)
##  withr         2.2.0   2020-04-20 [1] CRAN (R 3.6.2)
##  xfun          0.13    2020-04-13 [1] CRAN (R 3.6.2)
##  xml2          1.3.2   2020-04-23 [1] CRAN (R 3.6.2)
##  yaml          2.2.1   2020-02-01 [1] CRAN (R 3.6.0)
## 
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library

Running Time

Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:

Sys.time() - start_time
## Time difference of 6.887966 mins

Citations