Construct maps of the Salmon River basin and place carcass survey / RoSA genotype data on them. Compute relative frequencies of RoSA genotypes amongst spawners.
library(tidyverse)
library(sf)
library(lubridate)
library(parallel)
dir.create("outputs/010", recursive = TRUE, showWarnings = FALSE)
# grab and filter to the KLAMATH, and within that the Salmon River
salmon_bas <- st_read("geo-spatial/GISlayers_calw221_shp/calw221.shp") %>%
filter(str_detect(HUNAME, "KLAMATH")) %>% # get just Klamath Basin
filter(HANAME == "Salmon River") %>% # narrow it down to just Salmon River
st_union() %>% # Keep just the outer basin boundary (not the trib basins within)
st_transform(crs = 4326)
## Reading layer `calw221' from data source `/Users/eriq/Documents/MapStuff/GISlayers_calw221_shp/calw221.shp' using driver `ESRI Shapefile'
## Simple feature collection with 7035 features and 38 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -373985.5 ymin: -604500.1 xmax: 540085.8 ymax: 450043
## CRS: 3310
Plot it to make sure it is looking correct:
plot(salmon_bas)
Yep.
salmon_riv <- st_read("geo-spatial/California_Streams/California_Streams.shp") %>%
filter((str_detect(Hydrolog_1, "Klamath") & str_detect(Name, "Salmon")) |
Hydrolog_2 == 18010210 & str_detect(Name, "Salmon"))
## Reading layer `California_Streams' from data source `/Users/eriq/Documents/MapStuff/California_Streams/California_Streams.shp' using driver `ESRI Shapefile'
## Simple feature collection with 740662 features and 21 fields
## geometry type: LINESTRING
## dimension: XY
## bbox: xmin: -124.4291 ymin: 32.4911 xmax: -113.5302 ymax: 42.98283
## CRS: 4326
# this gets written out here
write_rds(salmon_riv, path = "stored_results/010/salmon_riv.rds", compress = "xz")
# if you couldn't get California_Streams, that is OK, just uncomment the following line:
# salmon_riv <- read_rds(path = "stored_results/010/salmon_riv.rds")
# plot it to make sure it looks right.
plot(st_geometry(salmon_riv))
Yep.
ggplot() +
geom_sf(data = salmon_bas) +
geom_sf(data = salmon_riv, aes(colour = Name))
Many of the carcasses are reported to have been sampled from one of a number of defined reaches in the river. Unfortunately, those reaches are not geo-referenced. So, we will have to use their descriptions from two fishery reports that are included in this repository:
reports/Meneks_2017-fall-run-survey.pdf
reports/Meneks_2017B-spring-survey.pdf
The data from both those tables was transcribed into data/salmon_river_reaches.csv
.
We will count up the river miles as we go according to the tables in the reports. To do this we write a function that takes a line string, blows it into points, grabs the parts that we want based on distance from a starting point, and then put them all back together into a line string. This function is in R/function_line_seg_by_distance.R
source("R/function_line_seg_by_distance.R")
Make a tibble has the hi and lo values for the reaches in miles, and then we can pick all of those out.
This block hangs, sometimes, when evaluated in the notebook. So, I’ve set eval=FALSE, and just have it read the result. But the code that generates it is all there.
reach_spec <- read_csv("data/salmon_river_reaches.csv") %>%
mutate(
reach_lo = units::set_units(reach_lo, mile),
reach_hi = units::set_units(reach_hi, mile)
)
# now join those onto the geometries and then chop each part of them
# out of the respective tribs/rivers
units::units_options(allow_mixed = TRUE)
reach_geos <- reach_spec %>%
left_join(salmon_riv, by = "Name") %>%
mutate(geometry2 = mclapply(1:n(),
function(i) line_seg_by_distance(geometry[i], reach_lo[[i]], reach_hi[[i]], asSFC = FALSE),
mc.cores = 8
))
survey_reaches <- reach_geos %>%
mutate(geometry = st_sfc(reach_geos$geometry2, crs = st_crs(reach_geos))) %>%
select(-geometry2)
write_rds(survey_reaches, path = "stored_results/010/survey_reaches.rds", compress = "xz")
survey_reaches <- read_rds(path = "stored_results/010/survey_reaches.rds")
When I put together the mileage chart I included reach 5, for example, as the union of 5A and 5B, but I left all of them (5, 5A, and 5B) in the data set. Now we must remove the non-numbered ones. Also, make a column that alternates between 0 and 1 as you go along the reaches.
surv_reaches_lettered <- survey_reaches %>%
filter(!(reach %in% c("4", "5", "6", "9", "10", "11"))) %>%
mutate(reach_alt = factor(1:n() %% 2))
st_geometry(surv_reaches_lettered) <- surv_reaches_lettered$geometry
st_crs(surv_reaches_lettered) <- st_crs(salmon_riv)
Plot this to see where the reaches are.
# get the centroids of each survey reach
centroids <- survey_reaches %>%
mutate(geometry = st_centroid(geometry))
g <- ggplot() +
geom_sf(data = salmon_bas) +
geom_sf(data = salmon_riv, colour = "gray") +
geom_sf(data = surv_reaches_lettered, aes(colour = reach_alt)) +
scale_colour_manual(values = c("red", "blue")) +
geom_sf_text(data = centroids, aes(geometry = geometry, label = reach)) +
theme_bw()
g
Read data in. Some individuals were genotyped more than once. We filter out duplicates of the same NMFS_DNA_ID, keeping the one that has the highest number of successfully genotyped SNPs in the RoSA. We discard all SNPs scored as “?”, retaining only those individuals with 2 or more SNPs remaining.
We also record whether the carcass was recorded with a REACH where it was found.
expando_rosa <- read_rds("data/salmon_river_spatiotemporal_rosa_data.rds") %>%
mutate(Year = year(COLLECTION_DATE)) %>%
mutate(
gcomp = str_remove_all(hapstr, "\\?"),
gcvec = strsplit(gcomp, ""),
called_geno = case_when(
nchar(gcomp) >= 2 & sapply(gcvec, function(x) all(x == "P")) ~ "S",
nchar(gcomp) >= 2 & sapply(gcvec, function(x) all(x == "M")) ~ "F",
nchar(gcomp) >= 2 & sapply(gcvec, function(x) all(x == "H")) ~ "H",
nchar(gcomp) >= 2 & (map_int(gcvec, n_distinct) >= 2) ~ "A",
TRUE ~ "M"
)
) %>%
mutate(RoSA_genotype = recode_factor(called_geno,
"S" = "EE",
"H" = "EL",
"F" = "LL",
"A" = "Ambiguous",
"M" = "Missing"
)) %>%
mutate(HasReach = (!is.na(REACH_SITE) & REACH_SITE != "Cecilville")) %>% # Cecilville is not a valid reach and all of those have a lat-long that we will use.
arrange(NMFS_DNA_ID, desc(nchar(gcomp))) %>%
group_by(NMFS_DNA_ID) %>%
mutate(rank = 1:n()) %>%
filter(rank == 1) %>%
ungroup() %>%
select(-rank)
Read in extra meta-data that goes with those, so we can do lats and longs if they have them.
expando_meta <- read_csv("data/SalmonRiver_RoSA_genotype_full_table_meta.csv") %>%
select(NMFS_DNA_ID, LATITUDE_F, LONGITUDE_F) %>%
rename(
Lat = LATITUDE_F,
Long = LONGITUDE_F
) %>%
mutate(HasLatLong = ifelse(is.na(Lat), FALSE, TRUE))
Join on the lat-longs
expando_r_latlong <- expando_meta %>%
left_join(expando_rosa, .) %>%
filter(!is.na(Year))
See how many have at least one of reach or lat/long:
expando_r_latlong %>%
filter(HasReach == TRUE | HasLatLong == TRUE)
Take stock of what we’ve got by year, location, and genotype. And present that as a table.
# summarise counts
yprg <- expando_r_latlong %>%
count(Year, HasReach, HasLatLong, RoSA_genotype)
# spread over genotype
yprg_s <- yprg %>%
spread(RoSA_genotype, n, fill = 0) %>%
mutate(`Total n` = EE + EL + LL + Ambiguous + Missing)
# save that to a file:
write_csv(yprg_s, path = "outputs/010/SalmonRiver_RoSA_genotype_counts.csv")
The table output above is Table S7 in the paper.
Given what we see in the Location Comments Field (i.e. sometimes the lat-long is the location of a nearby landmark, possibly not near the river itself), it will be best to use the reach to identify position if it is available, unless it just says “Cecilville” as that is not a valid reach specifer (those indivs have lat-longs, though). If reach is not available, and the Lat/Long is available, then we will use the Lat/Long.
Here is a function to randomly sample a point from a linestring:
rando_pt_from_linestring <- function(L) {
lapply(L, function(x) {
mat <- unclass(x)
r <- sample(1:nrow(mat), 1)
x[r, ]
})
}
Now…
# these have REACH
has_R <- expando_r_latlong %>%
filter(HasReach == TRUE)
# these don't
no_R_LL <- expando_r_latlong %>%
filter(HasReach == FALSE, HasLatLong == TRUE) %>%
mutate(rLat = Lat, rLong = Long) # populate the rLat and rLong fields with the same values in these cases
set.seed(100)
tmp <- has_R %>%
mutate(
reach = str_replace_all(REACH_SITE, "[Rr]each ", ""),
reach = ifelse(nchar(reach) < 5, reach, NA)
) %>% # the wipes out the Cecilvilles...they all have LatLong info.
left_join(survey_reaches, by = "reach")
# grab the five fish in reach 1. We have a lat-long location for that and we will
# just manually put the Reach 1 fish in there at that spot, and then wiggle it.
Reach1_fish <- tmp %>%
filter(reach == 1) %>%
mutate(
rLat = 41.377819,
rLong = -123.430
)
has_R_LL <- tmp %>%
filter(sapply(geometry, function(x) nrow(x) > 0)) %>% # some geometries were missing
mutate(rando_point = rando_pt_from_linestring(geometry)) %>%
mutate(
rLong = map_dbl(rando_point, function(x) x["X"]),
rLat = map_dbl(rando_point, function(x) x["Y"])
)
# now pick out from that only the non-geometry columns
has_R_LL_pick <- has_R_LL[, c(names(no_R_LL), "rLat", "rLong")]
Reach1_fish_pick <- Reach1_fish[, c(names(no_R_LL), "rLat", "rLong")]
LL_by_reach_or_coords <- bind_rows(
no_R_LL,
has_R_LL_pick,
Reach1_fish_pick
)
Now we have fish with decent Lat-Longs, sampled in the Salmon River from 1994 to 2018. Let’s see where the RoSA genotypes fall out, and we will include ambiguous and missing in here too.
We will jitter them, so we include seed.
set.seed(5)
m9 <- ggplot() +
geom_sf(data = salmon_bas, fill = "#e5f5e0") +
geom_sf(data = salmon_riv, colour = "gray") +
scale_colour_manual(values = c("#a6bddb", "black")) +
theme_bw() +
geom_jitter(
data = LL_by_reach_or_coords, aes(x = rLong, y = rLat, fill = RoSA_genotype), shape = 21, stroke = 0.3, colour = "black",
size = 2.5, width = .02, height = .02
) +
geom_sf(data = surv_reaches_lettered, aes(geometry = geometry, colour = reach_alt), show.legend = FALSE) +
scale_fill_manual(values = c(
`LL` = "blue",
`EE` = "gold",
`EL` = "tan2",
`Ambiguous` = "red",
`Missing` = "black"
)) +
facet_wrap(~RoSA_genotype, ncol = 2) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
xlab("Longitude") +
ylab("Latitude") +
labs(fill = "RoSA Genotype")
ggsave(m9, filename = "outputs/010/salmon-river-map-all-years-faceted-by-geno.pdf", width = 7, height = 10)
2006 has sufficient fish to make it worthwhile to break the carcass recoveries down by date.
alls_2006 <- LL_by_reach_or_coords %>%
filter(Year == 2006 & !(RoSA_genotype %in% c("Missing", "Ambiguous")))
# we can carve up the sample dates into 14-day blocks
date_ints <- seq(from = ymd("2006-9-18"), to = ymd("2006-12-14"), by = 14)
# and we will name those intervals according to the middle day in each
dint_names <- date_ints[-1] - 7
alls_2006_weeked <- alls_2006 %>%
mutate(mid_week_day = cut(COLLECTION_DATE, breaks = date_ints, labels = dint_names))
Now, with that, we can facet our way down the weeks:
set.seed(5)
m15 <- ggplot() +
geom_sf(data = salmon_bas, fill = "#e5f5e0") +
geom_sf(data = salmon_riv, colour = "gray") +
scale_colour_manual(values = c("#a6bddb", "black")) +
theme_bw() +
geom_jitter(
data = alls_2006_weeked, aes(x = rLong, y = rLat, fill = RoSA_genotype), shape = 21, stroke = 0.3, colour = "black",
size = 2.5, width = .02, height = .02
) +
geom_sf(data = surv_reaches_lettered, aes(geometry = geometry, colour = reach_alt), show.legend = FALSE) +
scale_fill_manual(values = c(
`LL` = "blue",
`EE` = "gold",
`EL` = "tan2"
)) +
facet_grid(RoSA_genotype ~ mid_week_day) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
xlab("Longitude") +
ylab("Latitude") +
labs(fill = "RoSA Genotype")
ggsave(m15, filename = "outputs/010/salmon-river-spatiotemporal-map-2006.pdf", width = 12, height = 8)
From the above assessment, it appears that some years have too few successfully genotyped fish to be very informative. So, we will filter down to only those years that have at least 20 successful genotypes. We could do some thing where we estimate frequencies given counts of the ambiguous and missing ones as well, but I feel that is overkill. Missingness is unlikely correlated with RoSA genotype. So, we will filter out the missing and ambiguous ones, and do the analysis on that.
for_hwe <- yprg %>%
ungroup() %>%
filter(RoSA_genotype %in% c("LL", "EL", "EE")) %>%
mutate(RoSA_genotype = fct_drop(RoSA_genotype)) %>%
group_by(Year) %>%
filter(sum(n) >= 20)
# then lump all the "populations" together in each year and compute observed frequencies
# as well as expected frequencies under HWE
fhwe_tots <- for_hwe %>%
group_by(Year, RoSA_genotype, .drop = FALSE) %>% # .drop = FALSE to get explicit 0's
summarise(N = sum(n)) %>%
mutate(
freq = N / sum(N),
NS = 2 * N[RoSA_genotype == "EE"] + N[RoSA_genotype == "EL"],
NF = 2 * N[RoSA_genotype == "LL"] + N[RoSA_genotype == "EL"],
fS = NS / (NS + NF),
fF = NF / (NS + NF)
) %>%
mutate(hwe_exp_freq = c(fS[1]^2, 2 * fF[1] * fS[1], fF[1]^2)) %>%
mutate(hwe_exp_n = hwe_exp_freq * sum(N))
Now, make a table of the relevant numbers here:
hwetab <- fhwe_tots %>%
select(Year, RoSA_genotype, N, hwe_exp_n)
hwetab
And now let’s make a figure. Hey! 2007 was a year in which we only got samples from the spring-run survey, so let’s drop that year. Also, year 2014 has almost no EEs, so it isn’t particularly informative comparison.
hwefig <- ggplot(hwetab %>% filter(Year > 1997 & Year != 2014)) +
geom_col(aes(x = RoSA_genotype, y = N, fill = RoSA_genotype)) +
geom_col(aes(x = RoSA_genotype, y = hwe_exp_n), fill = NA, colour = "green", size = 0.5) +
geom_col(aes(x = RoSA_genotype, y = hwe_exp_n),
fill = NA, colour = "black", size = 0.5,
linetype = "dashed"
) +
scale_fill_manual(values = c(
`LL` = "blue",
`EE` = "gold",
`EL` = "tan2"
)) +
facet_wrap(~Year, nrow = 1) +
theme_bw()
ggsave(hwefig, filename = "outputs/010/salmon-river_hwe_counts-3-panel.pdf", width = 8, height = 3)
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Sierra 10.12.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Denver
## date 2020-05-14
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.6 2020-04-05 [1] CRAN (R 3.6.2)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## class 7.3-15 2019-01-01 [2] CRAN (R 3.6.2)
## classInt 0.4-3 2020-04-07 [1] CRAN (R 3.6.2)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.3 2020-04-19 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 0.8.5 2020-03-07 [1] CRAN (R 3.6.0)
## e1071 1.7-3 2019-11-26 [1] CRAN (R 3.6.0)
## ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.0)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggplot2 * 3.3.0 2020-03-05 [1] CRAN (R 3.6.0)
## glue 1.4.0 2020-04-03 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.2.0 2019-11-08 [1] CRAN (R 3.6.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.4.0 2019-10-04 [1] CRAN (R 3.6.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## jsonlite 1.6.1 2020-02-02 [1] CRAN (R 3.6.0)
## KernSmooth 2.23-16 2019-10-15 [2] CRAN (R 3.6.2)
## knitr 1.28 2020-02-06 [1] CRAN (R 3.6.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-38 2018-11-04 [2] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lubridate * 1.7.8 2020-04-06 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## modelr 0.1.6 2020-02-22 [1] CRAN (R 3.6.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## nlme 3.1-142 2019-11-07 [2] CRAN (R 3.6.2)
## pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## Rcpp 1.0.4 2020-03-17 [1] CRAN (R 3.6.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.5 2020-03-01 [1] CRAN (R 3.6.0)
## rmarkdown 2.1 2020-01-20 [1] CRAN (R 3.6.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## sf * 0.9-2 2020-04-14 [1] CRAN (R 3.6.2)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.0)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.6.2)
## tidyr * 1.0.2 2020-01-24 [1] CRAN (R 3.6.0)
## tidyselect 1.0.0 2020-01-27 [1] CRAN (R 3.6.0)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## units 0.6-6 2020-03-16 [1] CRAN (R 3.6.0)
## vctrs 0.2.4 2020-03-10 [1] CRAN (R 3.6.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.13 2020-04-13 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.0)
##
## [1] /Users/eriq/Library/R/3.6/library
## [2] /Library/Frameworks/R.framework/Versions/3.6/Resources/library
Running the code and rendering this notebook required approximately this much time on a Mac laptop of middling speed:
Sys.time() - start_time
## Time difference of 50.76669 secs