library(tidyverse)
library(genoscapeRtools)
library(SNPRelate)
library(whoa)
dir.create("outputs/001", recursive = TRUE, showWarnings = FALSE)
rad <- read_rds(file = "data/rad_wifl_clean_175_105000.rds")
# number of individuals and SNPs in this matrix:
dim(rad)
## [1] 175 105000
First, get the collection locations so we can grab all the INW birds, which we largely expect to be in HWE proportions.
meta_full <- read_csv("data/appendices/app1-SITable1_Breed_IndvAssigwgenos.csv")
inw_ids <- meta_full %>%
filter(repunit == "INW", geno_method == "RAD") %>%
pull(Field_Number)
inw_rad <- rad[inw_ids, ]
Then compute expected vs observed genotype freqs in one of the large populations and plot those.
geno_freqs <- whoa::exp_and_obs_geno_freqs(d012 = t(inw_rad))
# now plot them, but don't jitter them at all
g <- ggplot(geno_freqs, aes(x = p_exp, y = p_obs, colour = geno)) +
geom_point(alpha = 0.1) +
facet_wrap(~ geno, nrow = 1) +
geom_polygon(data = geno_freq_boundaries(), fill = NA, linetype = "dashed", colour = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "solid")
g
The obvious paralogs fall along the bottom boundary (dotted line). Those are paralogs that are fixed for different alleles, or one which is fixed and the other segregating for variation. We can roughly pick them out like this:
likely_paralogs <- geno_freqs %>%
group_by(snp) %>%
mutate(is_likely_paralog = {
ret <- FALSE
if (
((p_exp[1] > 0.125 & p_exp[1] < 0.3) & p_obs[1] < 0.02) | # the 0 homozygote geno category
((p_exp[3] > 0.125 & p_exp[3] < 0.3) & p_obs[3] < 0.02) | # the 1 homozygote geno category
((p_exp[3] > 0.21 & p_exp[3] < 0.3) & p_obs[3] < 0.06) # the extra blip of the 1 homozygote category
) {
ret <- TRUE
}
ret
}) %>%
filter(is_likely_paralog) %>%
ungroup()
Let's plot them in black to see them on the original plot:
g + geom_point(
data = likely_paralogs,
colour = "black"
)
Get the ids of those likely paralogs:
paralog_ids <- unique(likely_paralogs$snp)
# how many are there?
length(paralog_ids)
## [1] 289
Remove those likely paralogs from the data set:
rad2 <- rad[, setdiff(colnames(rad), paralog_ids)]
Any sites with a frequency of 0 or 1 should be tossed. Let's compute the allele frequencies and find any such sites:
rad2NA <- rad2
rad2NA[rad2 == -1] <- NA
freqs <- colMeans(rad2NA, na.rm = TRUE)
monomorph <- freqs[near(freqs, 0) | near(freqs, 1)]
monomorph_ids <- names(monomorph)
# how many alleles are monomorphic
length(monomorph_ids)
## [1] 85
So, finally retain only the non-monomorphic ones:
rad3 <- rad2[, setdiff(colnames(rad2), monomorph_ids)]
# how many?
dim(rad3)
## [1] 175 104626
Missing fraction missing data in each individual:
miss_fract_within_indivs <- rowMeans(rad3 == -1)
# max missing within an individual
max(miss_fract_within_indivs)
## [1] 0.1559268
# mean missing within individuals
mean(miss_fract_within_indivs)
## [1] 0.02347627
Fraction of individuals missing data at each SNP:
miss_fract_at_each_snp <- colMeans(rad3 == -1)
# max missing at a SNP
max(miss_fract_at_each_snp)
## [1] 0.07428571
# mean missing across all SNPs
mean(miss_fract_at_each_snp)
## [1] 0.02347627
Look at the allele frequencies:
radNA <- rad3
radNA[rad3 == -1] <- NA
freqs <- colMeans(radNA, na.rm = TRUE) / 2
hist(freqs, breaks = seq(0,1, by = 0.01))
First PCA:
pca <- genoscape_pca(
dat012 = rad3
)
## Principal Component Analysis (PCA) on genotypes:
## Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
## # of samples: 175
## # of SNPs: 104,626
## using 1 thread
## # of principal components: 32
## PCA: the sum of all selected genotypes (0,1,2) = 5992010
## CPU capabilities: Double-Precision SSE2
## Thu Mar 4 11:10:53 2021 (internal increment: 4280)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed, 1s
## Thu Mar 4 11:10:54 2021 Begin (eigenvalues and eigenvectors)
## Thu Mar 4 11:10:54 2021 Done.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# and plot the first two principal components
ggplot(pca$pca_df, aes(x = `PC-01`, y = `PC-02`)) +
geom_point(shape = 1)
We will color the points on the above plot according to the collection locations of the birds.
meta <- meta_full %>%
select(Field_Number, repunit)
pca2 <- pca$pca_df %>%
left_join(meta, by = c("sample" = "Field_Number"))
# get the cluster_colors:
source("R/colors.R")
g <- ggplot(
data = pca2,
mapping = aes(
x = `PC-01`,
y = `PC-02`,
fill = repunit
)
) +
geom_point(shape = 21) +
scale_fill_manual(
values = cluster_colors,
name = "Collection\nLocation"
)
g
There is fairly evident structure.
The proportion variance explained by the first two PCs is:
pca$proportion_variance[1:2]
## [1] 0.03614243 0.02131370
The final figure for the paper was reflected (so that positions of clusters correspond better to geography) and "prettified" using image editing software.
Thisis a single step with genoscapeRtools:
sample_tibble <- meta %>%
filter(Field_Number %in% rownames(rad3)) %>%
rename(sample = Field_Number, group = repunit)
FST <- pairwise_fst(rad3, sample_groups = sample_tibble)
Now, let's just write those results out to a CSV file. Specifically we want to put the Fst values and the n values in tables:
fst_table <- FST$Fst %>%
pivot_wider(
id_cols = pop1,
names_from = pop2,
values_from = Fst
)
ns_table <- FST$Fst %>%
mutate(ns = str_c(n_pop1, ",", n_pop2)) %>%
pivot_wider(
id_cols = pop1,
names_from = pop2,
values_from = ns
)
# and write these out
write_csv(fst_table, "outputs/001/rad-fst-table.csv")
write_csv(ns_table, "outputs/001/rad-fst-sample-sizes-table.csv")
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed, listed in HH:MM:SS
.
td <- Sys.time() - start_time
tdf <- hms::as_hms(ceiling(hms::as_hms(td)))
dir.create("stored_run_times", showWarnings = FALSE, recursive = TRUE)
write_rds(tdf, "stored_run_times/001.rds")
tdf
## 00:00:59
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.0.3 (2020-10-10)
## os macOS Mojave 10.14.6
## system x86_64, darwin17.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2021-03-04
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib
## assertthat 0.2.1 2019-03-21 [1]
## backports 1.2.1 2020-12-09 [1]
## broom 0.7.3 2020-12-16 [1]
## cellranger 1.1.0 2016-07-27 [1]
## cli 2.2.0 2020-11-20 [1]
## colorspace 2.0-0 2020-11-11 [1]
## crayon 1.3.4 2017-09-16 [1]
## DBI 1.1.0 2019-12-15 [1]
## dbplyr 2.0.0 2020-11-03 [1]
## digest 0.6.27 2020-10-24 [1]
## dplyr * 1.0.2 2020-08-18 [1]
## ellipsis 0.3.1 2020-05-15 [1]
## evaluate 0.14 2019-05-28 [1]
## fansi 0.4.1 2020-01-08 [1]
## farver 2.0.3 2020-01-16 [1]
## forcats * 0.5.0 2020-03-01 [1]
## fs 1.5.0 2020-07-31 [1]
## gdsfmt * 1.26.1 2020-12-22 [1]
## generics 0.1.0 2020-10-31 [1]
## genoscapeRtools * 0.1.0 2021-01-17 [1]
## ggplot2 * 3.3.2 2020-06-19 [1]
## glue 1.4.2 2020-08-27 [1]
## gtable 0.3.0 2019-03-25 [1]
## haven 2.3.1 2020-06-01 [1]
## hms 0.5.3 2020-01-08 [1]
## htmltools 0.5.0 2020-06-16 [1]
## httr 1.4.2 2020-07-20 [1]
## jsonlite 1.7.2 2020-12-09 [1]
## knitr 1.30 2020-09-22 [1]
## labeling 0.4.2 2020-10-20 [1]
## lifecycle 0.2.0 2020-03-06 [1]
## lubridate 1.7.9.2 2020-11-13 [1]
## magrittr 2.0.1 2020-11-17 [1]
## modelr 0.1.8 2020-05-19 [1]
## munsell 0.5.0 2018-06-12 [1]
## pillar 1.4.7 2020-11-20 [1]
## pkgconfig 2.0.3 2019-09-22 [1]
## ps 1.5.0 2020-12-05 [1]
## purrr * 0.3.4 2020-04-17 [1]
## R6 2.5.0 2020-10-28 [1]
## Rcpp 1.0.5 2020-07-06 [1]
## readr * 1.4.0 2020-10-05 [1]
## readxl 1.3.1 2019-03-13 [1]
## reprex 0.3.0 2019-05-16 [1]
## rlang 0.4.9 2020-11-26 [1]
## rmarkdown 2.6 2020-12-14 [1]
## rstudioapi 0.13 2020-11-12 [1]
## rvest 0.3.6 2020-07-25 [1]
## scales 1.1.1 2020-05-11 [1]
## sessioninfo 1.1.1 2018-11-05 [1]
## SNPRelate * 1.24.0 2020-10-27 [1]
## stringi 1.5.3 2020-09-09 [1]
## stringr * 1.4.0 2019-02-10 [1]
## tibble * 3.0.4 2020-10-12 [1]
## tidyr * 1.1.2 2020-08-27 [1]
## tidyselect 1.1.0 2020-05-11 [1]
## tidyverse * 1.3.0 2019-11-21 [1]
## vctrs 0.3.6 2020-12-17 [1]
## whoa * 0.0.2.999 2021-01-28 [1]
## withr 2.3.0 2020-09-22 [1]
## xfun 0.19 2020-10-30 [1]
## xml2 1.3.2 2020-04-23 [1]
## yaml 2.2.1 2020-02-01 [1]
## source
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.1)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## Bioconductor
## CRAN (R 4.0.2)
## Github (eriqande/genoscapeRtools@3e1dbfc)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## Bioconductor
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## Github (eriqande/whoa@dddbeea)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
## CRAN (R 4.0.2)
##
## [1] /Users/eriq/Library/R/4.0/library
## [2] /Library/Frameworks/R.framework/Versions/4.0/Resources/library