Here we infer the ancestral recombination graph with RENT+ (Mirzaei and Wu 2016) and then make a nice plot with ‘ggtree’ (Yu et al. 2017)
library(tidyverse)
library(ape)
library(ggtree)
## Warning: package 'ggtree' was built under R version 3.6.3
dir.create("outputs/007", recursive = TRUE, showWarnings = FALSE)
dir.create("intermediates/007", recursive = TRUE, showWarnings = FALSE)
source("R/function_prep_elements.R")
big_haps2 <- read_rds(path = "outputs/004/big_haps2.rds")
Unfortunately, RENT+ does not seem to have a way to specify ancestral and derived alleles if you happen to know them (see Mirzaei and Wu 2016, p 1024). So we just make the non-spring associated alleles 0 and the others 1.
We will also run this on a 500 Kb chunk around GREB1L/ROCK1.
plist <- prep_elements(12.1e6, 12.6e6, 0.05e6, format = "%.2f", Return_D_only = TRUE)
# Let's call the fall-associated allele 0 and the spring-associated allele 1
for_input <- plist$D %>%
arrange(ecotype, lineage, pop, Indiv, haplo, haplo_name, POS) %>%
group_by(ecotype, lineage, pop, Indiv, haplo, haplo_name) %>%
mutate(zeroone = ifelse(alle2 == "F", 0L, 1L)) %>%
summarise(int_string = paste(zeroone, collapse = "")) %>%
ungroup() %>%
mutate(index = 1:n()) %>%
select(index, everything())
# now we can write that out
cat(sort(unique(plist$D$POS)), file = "intermediates/007/rent-input.txt", sep = " ", eol = "\n")
cat(for_input$int_string, sep = "\n", file = "intermediates/007/rent-input.txt", append = TRUE, eol = "\n")
Now, let’s try running that dude:
source script/java-jar-paths.sh
echo "Using RentPlus jar at: $RentPlus"
cd intermediates/007
java -jar $RentPlus -t rent-input.txt > rent-input.txt.stdout 2> rent-input.txt.stderr
awk '{print $0";"}' rent-input.txt.trees > rent-input.txt.ape-readable
## Using RentPlus jar at: /Users/eriq/Documents/others_code/RentPlus/RentPlus.jar
That turns out to take about five minutes.
tree_positions <- read_table2("intermediates/007/rent-input.txt.trees",
col_names = c("POS", "tree")
) %>%
select(POS)
all_trees <- read.tree("intermediates/007/rent-input.txt.ape-readable")
names(all_trees) <- tree_positions$POS
tmrcas <- read_table2("intermediates/007/rent-input.txt.Tmrcas", col_names = "Tmrca") %>%
bind_cols(tree_positions, .)
Let’s plot the local tree at position 12267547, which is one of our ROSA microhaplotype SNPs.
p <- ggtree(all_trees[["12267547"]]) %<+%
for_input +
scale_shape_manual(values = c(1, 19)) +
theme(legend.position = "right") # +
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
print(p)
Before we go further, I want to make a tree that has node labels so that I can look at those and use them for selecting different groups of haplotypes.
p_node_labels <- p +
geom_text2(aes(subset = !isTip, label = node), hjust = -.3) +
geom_tiplab()
print(p_node_labels)
Now, let’s add an ultra-zoom heatmap to it.
# first get the haplotype data.
long_ultra1 <- prep_elements(12.26e6, 12.29e6, 0.02e5, format = "%.3f", Return_D_only = TRUE)$D
To annotate each haplotype, figure out if it occurs in a fish that is heterozygous or not. We do that by counting up number of springer alleles on each haplotype.
First look at that distribution, and see that the haplotypes cluster into two groups on the basis of number of alleles most common amongst spring run fish.
Ssums <- long_ultra1 %>%
group_by(haplo_name, Indiv, pop, ecotype, lineage) %>%
summarise(NumSs = sum(alle2 == "S"))
ggplot(Ssums, aes(x = NumSs, fill = ecotype)) +
geom_histogram()
Pretty clearly 126 is a good dividing line. So, we can define heterozygotes on that basis, and we will also record population and basin.
# so, again, 126 is the sweetspot.
seq_meta <- Ssums %>%
group_by(Indiv) %>%
mutate(inHetFish = any(NumSs < 125) & any(NumSs >= 125)) %>%
mutate(Ecotype = recode(
ecotype,
"Fall" = "Fall Run",
"Spring" = "Spring Run",
"Late Fall" = "Late Fall Run",
"Winter" = "Winter Run"
)) %>%
mutate(Population = recode(
pop,
"Salmon_River" = "Salmon R.",
"Feather_River_Hatchery" = "Feather R.H.",
"Trinity_River_Hatchery" = "Trinity R.H.",
"San_Joaquin_River" = "San Joaquin R.",
"Coleman_Hatchery" = "Coleman H.",
"Sacramento_River" = "Sacramento R. Winter Run",
"Butte_Creek" = "Butte Ck."
)) %>%
mutate(Basin = recode(
lineage,
"Sacto" = "Sacramento Basin",
"Klamath" = "Klamath Basin"
))
Now we can make a matrix of values where each value takes up 7 columns, and then there will be a space. We will make this as a list which we can pass to cbind().
col_width <- 15
extra_mats <- list(
ecotype = matrix(rep(seq_meta$Ecotype, each = col_width), byrow = TRUE, ncol = col_width),
# sep1 = matrix(rep(NA, nrow(seq_meta)), ncol = 1),
het = matrix(rep(ifelse(seq_meta$inHetFish == TRUE, "Heterozygous", "Homozygous"), each = col_width),
byrow = TRUE,
ncol = col_width
),
# sep2 = matrix(rep(NA, nrow(seq_meta)), ncol = 1),
basin = matrix(rep(seq_meta$Basin, each = col_width), byrow = TRUE, ncol = col_width),
# sep3 = matrix(rep(NA, nrow(seq_meta)), ncol = 1),
# pop = matrix(rep(seq_meta$Population, each = col_width), byrow = TRUE, ncol = col_width),
sep4 = matrix(rep(NA, 3 * nrow(seq_meta)), ncol = 3)
) %>%
do.call(cbind, .)
rownames(extra_mats) <- seq_meta$haplo_name
ultra1 <- long_ultra1 %>%
select(haplo_name, POS, atypes) %>%
spread(POS, atypes)
ultra_mat <- as.matrix(ultra1[, -1])
rownames(ultra_mat) <- ultra1$haplo_name
ultra_sorted <- ultra_mat[for_input$haplo_name, ]
rownames(ultra_sorted) <- 1:nrow(ultra_sorted)
extra_sorted <- extra_mats[for_input$haplo_name, ]
rownames(extra_sorted) <- 1:nrow(extra_sorted)
colnames(extra_sorted) <- 1:ncol(extra_sorted)
combo_mat <- cbind(extra_sorted, ultra_sorted)
source("R/define_fcolors_all_sf.R")
fcolors_all_sf["Homozygous"] <- "black"
heat <- gheatmap(p, combo_mat,
offset = 0.2,
width = 0.8,
font.size = 0.8,
colnames_angle = -45,
hjust = 0, color = NA
) +
scale_fill_manual(values = fcolors_all_sf) +
theme(legend.position = "none")
## Warning: attributes are not identical across measure variables;
## they will be dropped
heat2 <- heat +
expand_limits(x = 27, y = 341) +
annotate("text", x = 96, y = 296, label = "Ecotype of Fish", size = 11, hjust = 1, angle = -45) +
annotate("text", x = 100.5, y = 296, label = "Fish RoSA Zygosity", size = 11, hjust = 1, angle = -45) +
annotate("text", x = 105, y = 296, label = "Basin/Lineage", size = 11, hjust = 1, angle = -45) +
annotate("text", x = 109, y = 299, label = "202 Variants on Chromosome 28 (12.26 to 12.29 Mb)", size = 10, hjust = 0)
# annotate("text", x = 113, y = 296, label = "Population", size = 12, hjust = 1, angle = -45)
# now, in order to get a legend like I would like to have, I think I am going to have
# to put it on there manually
ectx <- 45 # 10
fhx <- 80 # 45
blx <- 112 # 80
px <- 112
p1x <- 112
p2x <- 131
ty <- -16
ystep <- 5
rsize <- 3
xnudge <- 1
heat3 <- heat2 +
expand_limits(x = 27, y = -14) +
annotate("text", x = ectx, y = ty, size = 12, hjust = 0, label = "Ecotype of Fish") +
annotate("text", x = fhx, y = ty, size = 12, hjust = 0, label = "Fish RoSA Zygosity") +
annotate("text", x = blx, y = ty, size = 12, hjust = 0, label = "Basin/Lineage") +
# annotate("text", x = px, y = ty, size = 12, hjust = 0, label = "Population") +
#
# Now the Ecotype colors
#
annotate("rect", xmin = ectx, xmax = ectx + rsize, ymin = ty - ystep * 3, ymax = ty - ystep * 2, fill = "blue", colour = "black", size = 0.1) +
annotate("text", x = ectx + rsize + xnudge, y = ty - ystep * 2.5, size = 9, hjust = 0, vjust = 0.5, label = "Fall Run") +
annotate("rect", xmin = ectx, xmax = ectx + rsize, ymin = ty - ystep * 5, ymax = ty - ystep * 4, fill = "#a6cee3", colour = "black", size = 0.1) +
annotate("text", x = ectx + rsize + xnudge, y = ty - ystep * 4.5, size = 9, hjust = 0, vjust = 0.5, label = "Late Fall Run") +
annotate("rect", xmin = ectx, xmax = ectx + rsize, ymin = ty - ystep * 7, ymax = ty - ystep * 6, fill = "#ffff99", colour = "black", size = 0.1) +
annotate("text", x = ectx + rsize + xnudge, y = ty - ystep * 6.5, size = 9, hjust = 0, vjust = 0.5, label = "Winter Run") +
annotate("rect", xmin = ectx, xmax = ectx + rsize, ymin = ty - ystep * 9, ymax = ty - ystep * 8, fill = "gold", colour = "black", size = 0.1) +
annotate("text", x = ectx + rsize + xnudge, y = ty - ystep * 8.5, size = 9, hjust = 0, vjust = 0.5, label = "Spring Run") +
#
# Now the Heterozygous colors
#
annotate("rect", xmin = fhx, xmax = fhx + rsize, ymin = ty - ystep * 3, ymax = ty - ystep * 2, fill = "tan2", colour = "black", size = 0.1) +
annotate("text", x = fhx + rsize + xnudge, y = ty - ystep * 2.5, size = 9, hjust = 0, vjust = 0.5, label = "Heterozygous") +
annotate("rect", xmin = fhx, xmax = fhx + rsize, ymin = ty - ystep * 5, ymax = ty - ystep * 4, fill = "black", colour = "black", size = 0.1) +
annotate("text", x = fhx + rsize + xnudge, y = ty - ystep * 4.5, size = 9, hjust = 0, vjust = 0.5, label = "Homozygous") +
#
# Now the Basin colors
#
annotate("rect", xmin = blx, xmax = blx + rsize, ymin = ty - ystep * 3, ymax = ty - ystep * 2, fill = "#de2d26", colour = "black", size = 0.1) +
annotate("text", x = blx + rsize + xnudge, y = ty - ystep * 2.5, size = 9, hjust = 0, vjust = 0.5, label = "Sacramento Basin") +
annotate("rect", xmin = blx, xmax = blx + rsize, ymin = ty - ystep * 5, ymax = ty - ystep * 4, fill = "#74c476", colour = "black", size = 0.1) +
annotate("text", x = blx + rsize + xnudge, y = ty - ystep * 4.5, size = 9, hjust = 0, vjust = 0.5, label = "Klamath Basin") #+
#
# First column of Pop stuff (Klamath)
#
# annotate("rect", xmin = p1x, xmax = p1x + rsize, ymin = ty - ystep * 3, ymax = ty - ystep * 2, fill = "#238b45", colour = "black", size = 0.1) +
# annotate("text", x = p1x + rsize + xnudge, y = ty - ystep * 2.5, size = 9, hjust = 0, vjust = 0.5, label = "Trinity R.H.") +
# annotate("rect", xmin = p1x, xmax = p1x + rsize, ymin = ty - ystep * 5, ymax = ty - ystep * 4, fill = "#bae4b3", colour = "black", size = 0.1) +
# annotate("text", x = p1x + rsize + xnudge, y = ty - ystep * 4.5, size = 9, hjust = 0, vjust = 0.5, label = "Salmon R.") +
# #
# # Finally, the last column of populations
# #
# annotate("rect", xmin = p2x, xmax = p2x + rsize, ymin = ty - ystep * 3, ymax = ty - ystep * 2, fill = "#a50f15", colour = "black", size = 0.1) +
# annotate("text", x = p2x + rsize + xnudge, y = ty - ystep * 2.5, size = 9, hjust = 0, vjust = 0.5, label = "Sacramento R. Winter Run") +
# annotate("rect", xmin = p2x, xmax = p2x + rsize, ymin = ty - ystep * 5, ymax = ty - ystep * 4, fill = "#fb6a4a", colour = "black", size = 0.1) +
# annotate("text", x = p2x + rsize + xnudge, y = ty - ystep * 4.5, size = 9, hjust = 0, vjust = 0.5, label = "San Joaquin R.") +
# annotate("rect", xmin = p2x, xmax = p2x + rsize, ymin = ty - ystep * 7, ymax = ty - ystep * 6, fill = "#fc9272", colour = "black", size = 0.1) +
# annotate("text", x = p2x + rsize + xnudge, y = ty - ystep * 6.5, size = 9, hjust = 0, vjust = 0.5, label = "Butte Ck.") +
# annotate("rect", xmin = p2x, xmax = p2x + rsize, ymin = ty - ystep * 9, ymax = ty - ystep * 8, fill = "#fcbba1", colour = "black", size = 0.1) +
# annotate("text", x = p2x + rsize + xnudge, y = ty - ystep * 8.5, size = 9, hjust = 0, vjust = 0.5, label = "Coleman H.") +
# annotate("rect", xmin = p2x, xmax = p2x + rsize, ymin = ty - ystep * 11, ymax = ty - ystep * 10, fill = "#fee5d9", colour = "black", size = 0.1) +
# annotate("text", x = p2x + rsize + xnudge, y = ty - ystep * 10.5, size = 9, hjust = 0, vjust = 0.5, label = "Feather R.H.")
ggsave(heat3, filename = "outputs/007/ultra-heatmap.pdf", width = 30, height = 20)
To make the final figure for the paper, we used Inkscape to chop an equal-sized chunk out of each long branch and placed it atop the figure so that the figure did not include too much unused real estate.
We have Shiny App to visualize changes in the local coalescent trees within the RoSA. Here, we will write out what turns out to be the input for that into an rda and store it with the repo so anyone with RStudio can make that Shiny App. We retain the trees that extend from our left-most RoSA SNP assay to our right-most RoSA SNP assay
dir.create("coalescent-tree-shiny-app/data", showWarnings = FALSE, recursive = TRUE)
all_trees <- all_trees[as.integer(names(all_trees)) >= 12267547 & as.integer(names(all_trees)) <= 12281401]
save(for_input, all_trees, seq_meta, file = "coalescent-tree-shiny-app/data/coal-trees-etc-for-shiny.rda", compress = "xz")
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## ape * 5.3 2019-03-17 [1] CRAN (R 3.6.0)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## BiocManager 1.30.10 2019-11-16 [1] CRAN (R 3.6.0)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## ggtree * 2.0.4 2020-04-13 [1] Bioconductor
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 3.6.0)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvcheck 0.1.8 2020-03-01 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidytree 0.3.3 2020-04-02 [1] CRAN (R 3.6.2)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## treeio 1.10.0 2019-10-29 [1] Bioconductor
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 5.066844 mins
Mirzaei, Sajad, and Yufeng Wu. 2016. “RENT+: An Improved Method for Inferring Local Genealogical Trees from Haplotypes with Recombination.” Bioinformatics 33 (7). Oxford University Press: 1021–30.
Yu, Guangchuang, David K. Smith, Huachen Zhu, Yi Guan, and Tommy Tsan-Yuk Lam. 2017. “Ggtree: An R Package for Visualization and Annotation of Phylogenetic Trees with Their Covariates and Other Associated Data.” Methods in Ecology and Evolution 8 (1): 28–36. doi:10.1111/2041-210X.12628.