Dataset Combined Till99 & Yaq98 Ferraro Benthic Macrofauna.xlsx
from Christina Folger Folger.Christina@epa.gov.
Based on OBIS - Manual - Darwin Core, at a minimum we need to:
This document is an Rmarkdown HTML file, which knits chunks of R code with formatted text (in markdown format) to create a reproducible document using data:
You can “Hide” or expand “Code” by clicking the right side buttons, or “Hide All Code” / “Show All Code” by clicking the “Code” dropdown in the upper right.
You can download a zip of this repository, including the source Rmarkdown (index.Rmd
) for this document (index.html
) here:
# packages
library(tidyverse)
library(fs)
library(here)
library(readxl)
library(DT)
library(sf)
library(mapview)
library(taxize)
library(glue)
# paths
d_xlsx <- "data/raw/Combined Till99 & Yaq98 Ferraro Benthic Macrofauna.xlsx"
stations_csv <- "data/stations.csv"
taxa_csv <- "data/taxa.csv"
dwc_csv <- "data/dwc.csv"
d <- read_xlsx(
d_xlsx, sheet = 1,
# Expecting logical in L1299 / R1299C12: got 'Polychaeta'
col_types = c(rep("guess", 11), "text", "guess", "guess"))
# class(d)
# summary(spp_station)
# View(spp_station)
datatable(d)
# search UTM 10N at spatialreference.org
stations <- d %>%
group_by(StationID, State, Estuary, Habitat, Station, Northing, Easting) %>%
summarize() %>%
# nest()
st_as_sf(coords = c("Easting", "Northing"), crs = 32610) %>%
st_transform(crs=4326) %>%
mutate(
lon = st_coordinates(geometry)[,"X"],
lat = st_coordinates(geometry)[,"Y"])
write_csv(stations, stations_csv)
mapview(stations)
# get_wormsid() is time consuming, so cache output
if (!file_exists(taxa_csv)){
taxa <- d %>%
group_by(GenusSpecies) %>%
summarize() %>%
#mutate(
# wormsid = get_wormsid(GenusSpecies))
mutate(
query = str_replace(GenusSpecies, " sp", "")) %>%
mutate(
wormsid = get_wormsid(GenusSpecies, ask = FALSE))
write_csv(taxa, taxa_csv)
} else {
taxa <- read_csv(taxa_csv, col_type = cols())
}
#taxa$wormsid <- get_wormsid(taxa$GenusSpecies)
#taxa$wormsid <- get_wormsid(taxa$query)
#taxa$wormsid <- get_wormsid(taxa$query, ask = FALSE)
#table(!is.na(taxa$wormsid))
taxa %>%
arrange(!is.na(wormsid), GenusSpecies) %>%
datatable()
#table(d$Sieve_mm)
#table(d$Core_area_sq_m)
obs <- d %>%
select(StationID, DateSampled, GenusSpecies, SumOfCount) %>%
left_join(
stations %>%
select(StationID, lon, lat),
by="StationID") %>%
left_join(
taxa %>%
select(GenusSpecies, wormsid),
by="GenusSpecies") %>%
filter(!is.na(wormsid)) # n: 8,953 -> 4,667
dwc <- obs %>%
arrange(DateSampled, StationID, GenusSpecies) %>%
mutate(
institutionCode = "EPA",
collectionCode = "Combined Till99 & Yaq98 Ferraro Benthic Macrofauna",
catalogNumber = row_number(),
occurrenceID = glue("{institutionCode} - {collectionCode} - {catalogNumber}"),
eventDate = format(DateSampled, "%y-%m-%d T%h:%m:%s %z"),
occurrenceStatus = "present",
basisOfRecord = "HumanObservation") %>%
select(
# id
institutionCode, collectionCode, catalogNumber, occurrenceID,
# when
eventDate,
# where
decimalLongitude = lon, decimalLatitude = lat,
# what
scientificName = GenusSpecies, scientificNameID = wormsid,
occurrenceStatus, basisOfRecord)
write_csv(dwc, dwc_csv)
#datatable(obs)
dwc %>%
head(100) %>%
datatable()