Objectives

Dataset Combined Till99 & Yaq98 Ferraro Benthic Macrofauna.xlsx from Christina Folger Folger.Christina@epa.gov.

Based on OBIS - Manual - Darwin Core, at a minimum we need to:

  1. Get latitude & longitude from UTM coordinates.
  2. Get WoRMS ID from taxonomic name.
  3. Create table with required DarwinCore columns.

This document is an Rmarkdown HTML file, which knits chunks of R code with formatted text (in markdown format) to create a reproducible document using data:

You can “Hide” or expand “Code” by clicking the right side buttons, or “Hide All Code” / “Show All Code” by clicking the “Code” dropdown in the upper right.

You can download a zip of this repository, including the source Rmarkdown (index.Rmd) for this document (index.html) here:

# packages
library(tidyverse)
library(fs)
library(here)
library(readxl)
library(DT)
library(sf)
library(mapview)
library(taxize)
library(glue)

# paths
d_xlsx       <- "data/raw/Combined Till99 & Yaq98 Ferraro Benthic Macrofauna.xlsx"
stations_csv <- "data/stations.csv"
taxa_csv     <- "data/taxa.csv"
dwc_csv      <- "data/dwc.csv"

Read from Excel

d <- read_xlsx(
  d_xlsx, sheet = 1,
  # Expecting logical in L1299 / R1299C12: got 'Polychaeta'
  col_types = c(rep("guess", 11), "text", "guess", "guess"))

# class(d)
# summary(spp_station)
# View(spp_station)

datatable(d)

Show Stations on a Map

# search UTM 10N at spatialreference.org
stations <- d %>%
  group_by(StationID, State, Estuary, Habitat, Station, Northing, Easting) %>%
  summarize() %>%
  #  nest()
  st_as_sf(coords = c("Easting", "Northing"), crs = 32610) %>%
  st_transform(crs=4326) %>%
  mutate(
    lon = st_coordinates(geometry)[,"X"],
    lat = st_coordinates(geometry)[,"Y"])

write_csv(stations, stations_csv)

mapview(stations)

Match Species to WoRMS ID

# get_wormsid() is time consuming, so cache output
if (!file_exists(taxa_csv)){

  taxa <- d %>%
    group_by(GenusSpecies) %>%
    summarize() %>%
    #mutate(
    #  wormsid  = get_wormsid(GenusSpecies))
    mutate(
      query = str_replace(GenusSpecies, " sp", "")) %>%
    mutate(
      wormsid  = get_wormsid(GenusSpecies, ask = FALSE))  
  
  write_csv(taxa, taxa_csv)
} else {
  taxa <- read_csv(taxa_csv, col_type = cols())
}

#taxa$wormsid <- get_wormsid(taxa$GenusSpecies)
#taxa$wormsid <- get_wormsid(taxa$query)
#taxa$wormsid <- get_wormsid(taxa$query, ask = FALSE)

#table(!is.na(taxa$wormsid))
taxa %>%
  arrange(!is.na(wormsid), GenusSpecies) %>%
  datatable()

Format Table for Darwin Core

#table(d$Sieve_mm)
#table(d$Core_area_sq_m)

obs <- d %>%
  select(StationID, DateSampled, GenusSpecies, SumOfCount) %>%
  left_join(
    stations %>%
      select(StationID, lon, lat), 
    by="StationID") %>%
  left_join(
    taxa %>%
      select(GenusSpecies, wormsid), 
    by="GenusSpecies") %>%
  filter(!is.na(wormsid)) # n: 8,953 -> 4,667

dwc <- obs %>%
  arrange(DateSampled, StationID, GenusSpecies) %>%
  mutate(
    institutionCode = "EPA",
    collectionCode  = "Combined Till99 & Yaq98 Ferraro Benthic Macrofauna",
    catalogNumber   = row_number(),
    occurrenceID    = glue("{institutionCode} - {collectionCode} - {catalogNumber}"), 
    eventDate       = format(DateSampled, "%y-%m-%d T%h:%m:%s %z"),
    occurrenceStatus = "present",
    basisOfRecord = "HumanObservation") %>%
  select(
    # id
    institutionCode, collectionCode, catalogNumber, occurrenceID,
    # when
    eventDate, 
    # where
    decimalLongitude = lon, decimalLatitude = lat,
    # what
    scientificName = GenusSpecies, scientificNameID = wormsid,
    occurrenceStatus, basisOfRecord)

write_csv(dwc, dwc_csv)
#datatable(obs)
dwc %>%
  head(100) %>%
  datatable()