prep

  • load libraries
# load libraries
suppressPackageStartupMessages({
  library(raster)
  library(rgeos)
  library(readr)
  library(tibble)
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(sp)
  library(rgdal)
  library(ncdf4)
  library(scales)
  library(robis)
  library(DT)
  select = dplyr::select
})

hist by depth

Before filtering, summarize records by 5 decimal degrees at different depth bins.

# files
obis_rdata  = 'data/obis.dat'

# load data provided by Peter Provoost (2016-10-03)
load(obis_rdata)

# summarize data by 5 deg bins and depth
d_sum <- data %>%
  filter(
    latitude <= 90 | latitude >= -90,
    !is.na(depth)) %>%
  mutate(
    zone = cut(depth, c(0, 20, 200, Inf), labels=c('< 20', '20 - 200', '> 200')),
    band = round(latitude / 5) * 5) %>%
  group_by(band, zone) %>%
  summarize(
    records = n())

# histogram of depth by latitude
ggplot(d_sum, aes(x = band, y = records, fill = zone)) +
  geom_bar(stat = 'identity') +
  scale_fill_brewer(palette='YlGnBu', name='zone (m)') +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x='latitude')

land

Using Land - 1:10m Physical Vectors | Natural Earth, (3.26 MB) version 3.0.1.

Derived from 10m coastline. Continental polygons broken into smaller, contiguous pieces to avoid having too many points in any one polygon, facilitating faster data processing in certain software applications. (below) Yucatan peninsula, Cuba, and Hispaniola.

land_rds = 'data/ne_10m_land.rds'

# projections
crs_mol = '+proj=moll +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs'
crs_gcs = '+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0'

if (!file.exists(land_rds)){

  # download land from NE (Natural Earth)
  land_zip_url = 'http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/physical/ne_10m_land.zip'
  land_zip = tempfile(fileext='.zip')
  land_dir = tempdir()
  download.file(land_zip_url, land_zip)
  unzip(land_zip, exdir=land_dir)

  # read in as SpatialPolygonsDataFrame
  land     = readOGR(dsn=land_dir, layer='ne_10m_land')

  # project to Mollweide
  #land_mol = spTransform(land, CRS(crs_mol))

  # save
  saveRDS(land, land_rds)
} else {
  land = readRDS(land_rds)
}

depth

The bathymetric depth comes from the GEBCO 30 arc-second grid published in 2014. Here’s there requested attribution:

GEBCO_2014 Grid, version 20150318, www.gebco.net

pts

For the OBIS points, extract depth of locations and whether on land.

# files
depth_nc    = 'data/GEBCO_2014_2D.nc'
obis_rdata  = 'data/obis.dat'
pts_all_rds = 'data/pts_all.rds'

if (!file.exists(pts_all_rds)){

  # load data provided by Peter Provoost (2016-10-03)
  load(obis_rdata)

  # spatial points
  pts = data %>%
    add_rownames() %>%
    rename(lon = longitude, lat=latitude) %>%
    mutate(
      lon = ifelse(lon < -180, 360 + lon, lon),
      x = lon,
      y = lat)
  coordinates(pts) = ~x+y
  proj4string(pts) = CRS(crs_gcs)

  # load depth
  depth_r = raster(depth_nc, layer = 'elevation')

  # extract bottom depth (~2 min)
  system.time({
    pts = raster::extract(depth_r, pts, method='bilinear', sp=T)}) # 149.559

  # determine if on land ( min)
  #   try speeding up by chunking into sorted lon & lat
  land_parts = sp::disaggregate(land)
  system.time({
    t0 = Sys.time()
    pts_land = numeric(length(pts))
    x = round(seq(1, length(pts), length.out = 1001))
    for (i in 1:1000){ # i=500
      ix = x[i]:x[i+1]
      pts_land[ix] = sp::over(pts[ix,], land_parts)$featurecla

      dt = difftime(Sys.time(), t0, units='hours')
      cat(sprintf('%04.1f%% %0.2f hrs to go\n', i/10, dt / i * (1000 - i)))
    }
  }) # elapsed: ~1 hr
  pts$land = !is.na(pts_land)

  # update data frame
  pts@data = pts@data %>%
    rename(
      sample_depth = depth,
      bottom_depth = Elevation.relative.to.sea.level) %>%
    mutate(
      bottom_depth = bottom_depth * -1,
      bottom_zone  = cut(
        bottom_depth, c(-Inf, -0.01, 5, Inf), labels=c('<0', '0 - 5', '>5'),
        include.lowest=T, right=T))

  # project to Mollweide, for hist matching map
  pts_mol = spTransform(
    pts,
    CRS(crs_mol))
  xy_mol = coordinates(pts_mol) %>% as.data.frame()
  pts@data = bind_cols(
    pts@data,
    data.frame(
      x_mol = xy_mol$x,
      y_mol = xy_mol$y))
  
  # save to pts_all
  saveRDS(pts, pts_all_rds)
} else {
  pts = readRDS(pts_all_rds)
}

Here’s a summary of the points before filtering:

# show summary of values with NA's
summary(pts) # n=4,234,200
## Object of class SpatialPointsDataFrame
## Coordinates:
##    min max
## x -180 180
## y  -90  90
## Is projected: FALSE 
## proj4string :
## [+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0]
## Number of points: 4234200
## Data attributes:
##       lon               lat          sample_depth           year       
##  Min.   :-180.00   Min.   :-90.00   Min.   : -9999.0   Min.   :1520    
##  1st Qu.: -89.55   1st Qu.: -8.99   1st Qu.:     7.5   1st Qu.:1984    
##  Median : -55.70   Median : 38.50   Median :    30.0   Median :1996    
##  Mean   : -29.41   Mean   : 20.03   Mean   :   192.7   Mean   :1991    
##  3rd Qu.:  10.30   3rd Qu.: 48.62   3rd Qu.:   123.0   3rd Qu.:2003    
##  Max.   : 180.00   Max.   : 90.00   Max.   :623624.0   Max.   :2016    
##                                     NA's   :2243109    NA's   :268362  
##     records          bottom_depth         land         bottom_zone    
##  Min.   :    1.00   Min.   :-5668.40   Mode :logical   <0   : 593037  
##  1st Qu.:    1.00   1st Qu.:   17.88   FALSE:3806354   0 - 5: 227559  
##  Median :    2.00   Median :  119.50   TRUE :427846    >5   :3413142  
##  Mean   :   11.12   Mean   :  918.27   NA's :0         NA's :    462  
##  3rd Qu.:    7.00   3rd Qu.: 1210.75                                  
##  Max.   :33764.00   Max.   :10887.49                                  
##                     NA's   :462                                       
##      x_mol               y_mol           right        
##  Min.   :-18035560   Min.   :-9020048   Mode:logical  
##  1st Qu.: -8134156   1st Qu.:-1109819   TRUE:4234200  
##  Median : -3964421   Median : 4621056   NA's:0        
##  Mean   : -2462753   Mean   : 2374513                 
##  3rd Qu.:   708197   3rd Qu.: 5728304                 
##  Max.   : 18040096   Max.   : 9020048                 
## 

filter pts

# filter spatial data, using subset (not dplyr::filter)
pts = subset(
  pts,
  !is.na(lon) & !is.na(lat) &                  # lon,lat
  !is.na(year) &                               # year
  sample_depth <= 20 & !is.na(sample_depth) &  # sample_depth
  !land &                                      # land
  bottom_zone %in% c('0 - 5', '>5'))           # bottom_zone

And after filtering:

# summarize points
summary(pts) # n=875,472
## Object of class SpatialPointsDataFrame
## Coordinates:
##         min      max
## x -180.0000 179.9998
## y  -78.0498  90.0000
## Is projected: FALSE 
## proj4string :
## [+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0]
## Number of points: 761731
## Data attributes:
##       lon                lat          sample_depth         year     
##  Min.   :-180.000   Min.   :-78.05   Min.   :-999.9   Min.   :1754  
##  1st Qu.: -70.150   1st Qu.:-34.92   1st Qu.:   0.0   1st Qu.:1982  
##  Median :  -9.458   Median : 38.91   Median :   7.5   Median :1997  
##  Mean   :  -7.503   Mean   : 16.10   Mean   :   5.9   Mean   :1990  
##  3rd Qu.:  33.510   3rd Qu.: 53.15   3rd Qu.:   7.5   3rd Qu.:2002  
##  Max.   : 180.000   Max.   : 90.00   Max.   :  20.0   Max.   :2014  
##     records         bottom_depth         land         bottom_zone   
##  Min.   :   1.00   Min.   :  -0.008   Mode :logical   <0   :     0  
##  1st Qu.:   2.00   1st Qu.:  26.000   FALSE:761731    0 - 5: 86862  
##  Median :   6.00   Median : 175.934   NA's :0         >5   :674869  
##  Mean   :  14.29   Mean   :1138.200                                 
##  3rd Qu.:  14.00   3rd Qu.:2084.878                                 
##  Max.   :5735.00   Max.   :9329.469                                 
##      x_mol               y_mol           right        
##  Min.   :-17976910   Min.   :-8394926   Mode:logical  
##  1st Qu.: -5663888   1st Qu.:-4213669   TRUE:761731   
##  Median :  -708287   Median : 4667203   NA's:0        
##  Mean   : -1001721   Mean   : 1928800                 
##  3rd Qu.:  2466263   3rd Qu.: 6199229                 
##  Max.   : 18039794   Max.   : 9020048

map

Summarize filtered points and project to Mollweide for more area-realistic map.

rasterize pts

Get density of records by 1 degree cell by summing the field records from points within each cell and dividing by the cell’s area.

# setup reference raster for global 1 deg cells
r = raster(extent(-180, 180, -90, 90), ncols=360, nrows=180, crs=crs_gcs)

# calculate number of records per reference 1 deg raster
#  rasterize: points that fall on a border between cells are placed
#             in the cell to the right and/or in the cell below
r_n = rasterize(pts, r, 'records', fun=sum) # plot(r_n)

# calculate area
r_km2 = area(r)                             # plot(r_km2)

# calculate density: number of records per km2
r_n_km2 = r_n / r_km2                       # plot(r_n_km2)

plot(r_n_km2)

map mollweide

Plot the global map in Mollweide projection.

world = map_data('world')

# get max value for plotting time-series
max_i  = which.max(r_n_km2)
max_xy = xyFromCell(r_n_km2, max_i)
max_v  = r_n_km2[max_i] # 90.86918

m_pdf = 'fig/obis_map.pdf'
m = rasterVis::gplot(r_n_km2) +
  geom_rect(
    aes(xmin=x_min, xmax=x_max, ymin=y_min, ymax=y_max),
    data=data.frame(x_min=-180, x_max=180, y_min=-90, y_max=90),
    colour='black', fill='white', size=0.3, inherit.aes=F) +
  geom_tile(aes(fill=log10(value))) + # , colour = NA
  scale_fill_distiller(
    type='div', palette='Spectral',
    name='log10(records / km2)', na.value='white') +
  geom_map(
    data=world, map=world, aes(x=long, y=lat, map_id=region), fill = 'grey70') +
  geom_point(
    aes(x=x, y=y), as.data.frame(max_xy), color='red') +
  coord_map("mollweide", xlim=c(-180, 180)) +
  ylab('latitude') +
  theme(
    panel.background=element_blank(),
    axis.text.x=element_blank(), axis.ticks.x=element_blank(), # x ticks
    axis.title.x=element_blank(),                              # x title
    panel.border=element_blank(),                              # panel
    legend.position='bottom')

# print to pdf
pdf(m_pdf)
print(m)
invisible(dev.off())
system(sprintf('open %s', m_pdf))

# output
m

datasets for densest cell

The most dense cell (center at -80.5, 25.5)

occ_csv          = 'data/occurrences.csv'
occ_datasets_csv = 'data/occurrence_datasets.csv'

xy_wkt = function(x,y){
  sprintf(
    'POLYGON((%g %g,%g %g,%g %g,%g %g,%g %g))',
    x-0.5, y-0.5, x+0.5, y-0.5, x+0.5, y+0.5, x-0.5, y+0.5, x-0.5, y-0.5)
}
bb_wkt = xy_wkt(x=max_xy[1], y=max_xy[2])

if (!file.exists(occ_datasets_csv)){

  # retrieve occurrences for bbox from OBIS
  system.time({
    occ = occurrence(geometry=bb_wkt)  # 1,051,906 records; ~ 1.5 hrs
  })
  write_csv(occ, occ_csv)              # 621.6 MB
  
  # summarize
  occ_datasets =
    # dataset, year
    occ %>%
    filter(yearcollected >= 2000) %>%
    group_by(datasetName, yearcollected) %>%
    summarize(
      records = n()) %>%
    spread(yearcollected, records) %>%
    # dataset, records since 2000
    left_join(
      occ %>%
      filter(yearcollected >= 2000) %>%
      group_by(datasetName) %>%
      summarize(since2000 = n()),
      by='datasetName') %>%
    # dataset, records since 2000
    left_join(
      occ %>%
      group_by(datasetName) %>%
      summarize(all = n()),
      by='datasetName') %>%
    # sort by biggest
    arrange(desc(all), datasetName)
    
  write_csv(occ_datasets, occ_datasets_csv, na='')

} else {
  occ_datasets = read_csv(occ_datasets_csv)
}

# table
occ_datasets %>%
  arrange(desc(all), datasetName) %>%
  datatable() %>%
  formatCurrency(2:ncol(occ_datasets), currency='', interval=3, mark=',', digits=0)

TODO: - Consider proportion of area available to coastal vs pelagic, after removing land and accounting for latitudinal variation.

hist by latitude

Histogram of records using the same vertical axes as the Mollweide map for manually aligning after.

# bin records into 10 deg latitudinal bands
lats = seq(-85, 85, by=10)
h_d = data_frame(
  lat_band     = rep(lats, each=2),
  bottom_zone  = rep(c('0 - 5','>5'), length(lats))) %>%
  left_join(
    pts@data %>%
    mutate(
      lat_band = cut(lat, seq(-90, 90, by=10), labels = lats, include.lowest=T, right=T)) %>%
    group_by(lat_band, bottom_zone) %>%
    summarize(
      records = sum(records)) %>%
    ungroup() %>%
    mutate(
      lat_band    = as.numeric(as.character(lat_band)),
      bottom_zone = as.character(bottom_zone)) %>%
    filter(bottom_zone %in% c('0 - 5','>5')),
    by=c('lat_band','bottom_zone')) %>%
  mutate(
    records = ifelse(is.na(records), 0, records))
    
# get mollweide latitudinal coordinates
p = data_frame(lat = c(-90,90,seq(-90, 90, by=10),lats), lon = 0)
coordinates(p) = ~lon+lat
proj4string(p) = CRS(crs_gcs)            
m = spTransform(p, CRS(crs_mol))
h_d = h_d %>%
  left_join(
    data_frame(
      lat_band = p$lat, 
      y_mol    = coordinates(m)[,2]),
    by = 'lat_band')
write_csv(
  h_d %>%
    select(-y_mol) %>%
    spread(bottom_zone, records) %>%
    arrange(desc(lat_band)), 'data/obis_hist_lat_records.csv')

lims = data_frame(
  lat   = c(-90, 90),
  y_mol = coordinates(m)[1:2,2])

# plot
h = ggplot(aes(x=y_mol, y=records, colour=bottom_zone), data=h_d) +
  geom_point() +
  geom_line() +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))) +
  scale_color_discrete(name='depth (m)') +
  scale_x_continuous(
    breaks=coordinates(m)[3:(3+length(seq(-90, 90, by=10))),2],
    limits=lims$y_mol) +
  coord_cartesian(
    xlim = lims$y_mol, 
    ylim = c(1,max(h_d$records*1.1)), expand=F) +
  theme_bw() +
  coord_flip() +
  theme(
    axis.title.y = element_blank(), 
    axis.ticks.y = element_blank(), 
    axis.text.y = element_blank(),
    legend.justification = c(0,0),
    legend.position      = c(0,0.65))

# print to pdf
h_pdf = 'fig/obis_hist.pdf'
pdf(h_pdf, width=2, height=6)
print(h)
invisible(dev.off())
system(sprintf('open %s', h_pdf))

# output
h

time-series for densest cell

For the 1 deg cell with the maximum OBIS record density show records by year for coastal (0-5 m) and pelagic (>5 m).

# get records over years for cell
d_s = pts@data %>%
  rownames_to_column() %>%
  filter(
    #  rasterize: points that fall on a border between cells are placed
    #             in the cell to the right and/or in the cell below
    lon > max_xy[1]-0.5, lon <= max_xy[1]+0.5,
    lat > max_xy[2]-0.5, lat <= max_xy[2]+0.5) %>%
  group_by(year, bottom_zone) %>%
  summarize(
    records     = sum(records)) %>%
  ungroup() %>%
  mutate(
    records_cum     = cumsum(records),
    records_cum_pct = records_cum / sum(records)) %>%
  filter(
    year >= 2000)

# get summaries of previous and total across bottom_zones
records_prev = d_s %>% 
  filter(year == 2000) %>%
  summarize(
    records_prev = sum(records_cum) - sum(records)) %>%
  .$records_prev
records_tot = d_s %>% 
  filter(year == max(year)) %>%
  summarize(
    records_tot = sum(records_cum)) %>%
  .$records_tot

# plot time series
s = ggplot(d_s, aes(x=year, y=records, color=bottom_zone)) +
  geom_point() +
  geom_line() +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))) +
  scale_color_discrete(name='depth (m)') +
  theme(legend.justification=c(0,1), legend.position=c(0,1)) +
  theme_bw()

# print to pdf
s_pdf = 'fig/obis_ts.pdf'
pdf(s_pdf, width=3, height=1.5)
print(s)
invisible(dev.off())
system(sprintf('open %s', s_pdf))

# show s
s

The max value 70.3804142 is found at -80.5, 25.5. Although the time series for this cell started in 1903, only 1236 of 1565733 records were observed prior to 2000.

Here’s the table of values in the time-series histogram used since 2000.

DT::datatable(d_s)

map, hist, time-series combined

Manually combined the map and histogram in Adobe Illustrator from the pdf outputs: obis_map.pdf + obis_hist.pdf -> obis_ts.pdf.

Figure 1. Density of OBIS records per km2 by 1 degree cells on log10 scale observed in shallow water (<20m) at coastal (0-5m) and pelagic (>5m) locations (by bathymetric depth) having a valid year and occurring in the ocean. The map and latitudinal histogram (right) highlight the lack of observations spatially, particularly in the tropics and southern hemisphere. Even for the highlighted cell (red dot in Florida, US) having the highest observational density (70.4 records/km2), the number of records varies widely across years (lower left). Routine satellite remote sensing combined with modeling promise to fill these widespread data gaps in space and time for assessment of marine biodiversity in the coastal and pelagic zones globally.

old: map, hist for all depths

For comparison, here’s the combined map and histogram for all depths (ie not filtered by sample_depth <= 20m) and regardless of valid year: obis_map_hist_all-depths.pdf.