# load libraries
suppressPackageStartupMessages({
library(raster)
library(rgeos)
library(readr)
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(sp)
library(rgdal)
library(ncdf4)
library(scales)
library(robis)
library(DT)
select = dplyr::select
})
Before filtering, summarize records by 5 decimal degrees at different depth bins.
# files
obis_rdata = 'data/obis.dat'
# load data provided by Peter Provoost (2016-10-03)
load(obis_rdata)
# summarize data by 5 deg bins and depth
d_sum <- data %>%
filter(
latitude <= 90 | latitude >= -90,
!is.na(depth)) %>%
mutate(
zone = cut(depth, c(0, 20, 200, Inf), labels=c('< 20', '20 - 200', '> 200')),
band = round(latitude / 5) * 5) %>%
group_by(band, zone) %>%
summarize(
records = n())
# histogram of depth by latitude
ggplot(d_sum, aes(x = band, y = records, fill = zone)) +
geom_bar(stat = 'identity') +
scale_fill_brewer(palette='YlGnBu', name='zone (m)') +
scale_y_continuous(expand = c(0, 0)) +
labs(x='latitude')
Using Land - 1:10m Physical Vectors | Natural Earth, (3.26 MB) version 3.0.1.
Derived from 10m coastline. Continental polygons broken into smaller, contiguous pieces to avoid having too many points in any one polygon, facilitating faster data processing in certain software applications. (below) Yucatan peninsula, Cuba, and Hispaniola.
land_rds = 'data/ne_10m_land.rds'
# projections
crs_mol = '+proj=moll +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs'
crs_gcs = '+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0'
if (!file.exists(land_rds)){
# download land from NE (Natural Earth)
land_zip_url = 'http://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/physical/ne_10m_land.zip'
land_zip = tempfile(fileext='.zip')
land_dir = tempdir()
download.file(land_zip_url, land_zip)
unzip(land_zip, exdir=land_dir)
# read in as SpatialPolygonsDataFrame
land = readOGR(dsn=land_dir, layer='ne_10m_land')
# project to Mollweide
#land_mol = spTransform(land, CRS(crs_mol))
# save
saveRDS(land, land_rds)
} else {
land = readRDS(land_rds)
}
The bathymetric depth comes from the GEBCO 30 arc-second grid published in 2014. Here’s there requested attribution:
GEBCO_2014 Grid, version 20150318, www.gebco.net
For the OBIS points, extract depth of locations and whether on land.
# files
depth_nc = 'data/GEBCO_2014_2D.nc'
obis_rdata = 'data/obis.dat'
pts_all_rds = 'data/pts_all.rds'
if (!file.exists(pts_all_rds)){
# load data provided by Peter Provoost (2016-10-03)
load(obis_rdata)
# spatial points
pts = data %>%
add_rownames() %>%
rename(lon = longitude, lat=latitude) %>%
mutate(
lon = ifelse(lon < -180, 360 + lon, lon),
x = lon,
y = lat)
coordinates(pts) = ~x+y
proj4string(pts) = CRS(crs_gcs)
# load depth
depth_r = raster(depth_nc, layer = 'elevation')
# extract bottom depth (~2 min)
system.time({
pts = raster::extract(depth_r, pts, method='bilinear', sp=T)}) # 149.559
# determine if on land ( min)
# try speeding up by chunking into sorted lon & lat
land_parts = sp::disaggregate(land)
system.time({
t0 = Sys.time()
pts_land = numeric(length(pts))
x = round(seq(1, length(pts), length.out = 1001))
for (i in 1:1000){ # i=500
ix = x[i]:x[i+1]
pts_land[ix] = sp::over(pts[ix,], land_parts)$featurecla
dt = difftime(Sys.time(), t0, units='hours')
cat(sprintf('%04.1f%% %0.2f hrs to go\n', i/10, dt / i * (1000 - i)))
}
}) # elapsed: ~1 hr
pts$land = !is.na(pts_land)
# update data frame
pts@data = pts@data %>%
rename(
sample_depth = depth,
bottom_depth = Elevation.relative.to.sea.level) %>%
mutate(
bottom_depth = bottom_depth * -1,
bottom_zone = cut(
bottom_depth, c(-Inf, -0.01, 5, Inf), labels=c('<0', '0 - 5', '>5'),
include.lowest=T, right=T))
# project to Mollweide, for hist matching map
pts_mol = spTransform(
pts,
CRS(crs_mol))
xy_mol = coordinates(pts_mol) %>% as.data.frame()
pts@data = bind_cols(
pts@data,
data.frame(
x_mol = xy_mol$x,
y_mol = xy_mol$y))
# save to pts_all
saveRDS(pts, pts_all_rds)
} else {
pts = readRDS(pts_all_rds)
}
Here’s a summary of the points before filtering:
# show summary of values with NA's
summary(pts) # n=4,234,200
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## x -180 180
## y -90 90
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0]
## Number of points: 4234200
## Data attributes:
## lon lat sample_depth year
## Min. :-180.00 Min. :-90.00 Min. : -9999.0 Min. :1520
## 1st Qu.: -89.55 1st Qu.: -8.99 1st Qu.: 7.5 1st Qu.:1984
## Median : -55.70 Median : 38.50 Median : 30.0 Median :1996
## Mean : -29.41 Mean : 20.03 Mean : 192.7 Mean :1991
## 3rd Qu.: 10.30 3rd Qu.: 48.62 3rd Qu.: 123.0 3rd Qu.:2003
## Max. : 180.00 Max. : 90.00 Max. :623624.0 Max. :2016
## NA's :2243109 NA's :268362
## records bottom_depth land bottom_zone
## Min. : 1.00 Min. :-5668.40 Mode :logical <0 : 593037
## 1st Qu.: 1.00 1st Qu.: 17.88 FALSE:3806354 0 - 5: 227559
## Median : 2.00 Median : 119.50 TRUE :427846 >5 :3413142
## Mean : 11.12 Mean : 918.27 NA's :0 NA's : 462
## 3rd Qu.: 7.00 3rd Qu.: 1210.75
## Max. :33764.00 Max. :10887.49
## NA's :462
## x_mol y_mol right
## Min. :-18035560 Min. :-9020048 Mode:logical
## 1st Qu.: -8134156 1st Qu.:-1109819 TRUE:4234200
## Median : -3964421 Median : 4621056 NA's:0
## Mean : -2462753 Mean : 2374513
## 3rd Qu.: 708197 3rd Qu.: 5728304
## Max. : 18040096 Max. : 9020048
##
# filter spatial data, using subset (not dplyr::filter)
pts = subset(
pts,
!is.na(lon) & !is.na(lat) & # lon,lat
!is.na(year) & # year
sample_depth <= 20 & !is.na(sample_depth) & # sample_depth
!land & # land
bottom_zone %in% c('0 - 5', '>5')) # bottom_zone
And after filtering:
# summarize points
summary(pts) # n=875,472
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## x -180.0000 179.9998
## y -78.0498 90.0000
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0]
## Number of points: 761731
## Data attributes:
## lon lat sample_depth year
## Min. :-180.000 Min. :-78.05 Min. :-999.9 Min. :1754
## 1st Qu.: -70.150 1st Qu.:-34.92 1st Qu.: 0.0 1st Qu.:1982
## Median : -9.458 Median : 38.91 Median : 7.5 Median :1997
## Mean : -7.503 Mean : 16.10 Mean : 5.9 Mean :1990
## 3rd Qu.: 33.510 3rd Qu.: 53.15 3rd Qu.: 7.5 3rd Qu.:2002
## Max. : 180.000 Max. : 90.00 Max. : 20.0 Max. :2014
## records bottom_depth land bottom_zone
## Min. : 1.00 Min. : -0.008 Mode :logical <0 : 0
## 1st Qu.: 2.00 1st Qu.: 26.000 FALSE:761731 0 - 5: 86862
## Median : 6.00 Median : 175.934 NA's :0 >5 :674869
## Mean : 14.29 Mean :1138.200
## 3rd Qu.: 14.00 3rd Qu.:2084.878
## Max. :5735.00 Max. :9329.469
## x_mol y_mol right
## Min. :-17976910 Min. :-8394926 Mode:logical
## 1st Qu.: -5663888 1st Qu.:-4213669 TRUE:761731
## Median : -708287 Median : 4667203 NA's:0
## Mean : -1001721 Mean : 1928800
## 3rd Qu.: 2466263 3rd Qu.: 6199229
## Max. : 18039794 Max. : 9020048
Summarize filtered points and project to Mollweide for more area-realistic map.
Get density of records by 1 degree cell by summing the field records
from points within each cell and dividing by the cell’s area.
# setup reference raster for global 1 deg cells
r = raster(extent(-180, 180, -90, 90), ncols=360, nrows=180, crs=crs_gcs)
# calculate number of records per reference 1 deg raster
# rasterize: points that fall on a border between cells are placed
# in the cell to the right and/or in the cell below
r_n = rasterize(pts, r, 'records', fun=sum) # plot(r_n)
# calculate area
r_km2 = area(r) # plot(r_km2)
# calculate density: number of records per km2
r_n_km2 = r_n / r_km2 # plot(r_n_km2)
plot(r_n_km2)
Plot the global map in Mollweide projection.
world = map_data('world')
# get max value for plotting time-series
max_i = which.max(r_n_km2)
max_xy = xyFromCell(r_n_km2, max_i)
max_v = r_n_km2[max_i] # 90.86918
m_pdf = 'fig/obis_map.pdf'
m = rasterVis::gplot(r_n_km2) +
geom_rect(
aes(xmin=x_min, xmax=x_max, ymin=y_min, ymax=y_max),
data=data.frame(x_min=-180, x_max=180, y_min=-90, y_max=90),
colour='black', fill='white', size=0.3, inherit.aes=F) +
geom_tile(aes(fill=log10(value))) + # , colour = NA
scale_fill_distiller(
type='div', palette='Spectral',
name='log10(records / km2)', na.value='white') +
geom_map(
data=world, map=world, aes(x=long, y=lat, map_id=region), fill = 'grey70') +
geom_point(
aes(x=x, y=y), as.data.frame(max_xy), color='red') +
coord_map("mollweide", xlim=c(-180, 180)) +
ylab('latitude') +
theme(
panel.background=element_blank(),
axis.text.x=element_blank(), axis.ticks.x=element_blank(), # x ticks
axis.title.x=element_blank(), # x title
panel.border=element_blank(), # panel
legend.position='bottom')
# print to pdf
pdf(m_pdf)
print(m)
invisible(dev.off())
system(sprintf('open %s', m_pdf))
# output
m
The most dense cell (center at -80.5, 25.5)
occ_csv = 'data/occurrences.csv'
occ_datasets_csv = 'data/occurrence_datasets.csv'
xy_wkt = function(x,y){
sprintf(
'POLYGON((%g %g,%g %g,%g %g,%g %g,%g %g))',
x-0.5, y-0.5, x+0.5, y-0.5, x+0.5, y+0.5, x-0.5, y+0.5, x-0.5, y-0.5)
}
bb_wkt = xy_wkt(x=max_xy[1], y=max_xy[2])
if (!file.exists(occ_datasets_csv)){
# retrieve occurrences for bbox from OBIS
system.time({
occ = occurrence(geometry=bb_wkt) # 1,051,906 records; ~ 1.5 hrs
})
write_csv(occ, occ_csv) # 621.6 MB
# summarize
occ_datasets =
# dataset, year
occ %>%
filter(yearcollected >= 2000) %>%
group_by(datasetName, yearcollected) %>%
summarize(
records = n()) %>%
spread(yearcollected, records) %>%
# dataset, records since 2000
left_join(
occ %>%
filter(yearcollected >= 2000) %>%
group_by(datasetName) %>%
summarize(since2000 = n()),
by='datasetName') %>%
# dataset, records since 2000
left_join(
occ %>%
group_by(datasetName) %>%
summarize(all = n()),
by='datasetName') %>%
# sort by biggest
arrange(desc(all), datasetName)
write_csv(occ_datasets, occ_datasets_csv, na='')
} else {
occ_datasets = read_csv(occ_datasets_csv)
}
# table
occ_datasets %>%
arrange(desc(all), datasetName) %>%
datatable() %>%
formatCurrency(2:ncol(occ_datasets), currency='', interval=3, mark=',', digits=0)
TODO: - Consider proportion of area available to coastal vs pelagic, after removing land and accounting for latitudinal variation.
Histogram of records using the same vertical axes as the Mollweide map for manually aligning after.
# bin records into 10 deg latitudinal bands
lats = seq(-85, 85, by=10)
h_d = data_frame(
lat_band = rep(lats, each=2),
bottom_zone = rep(c('0 - 5','>5'), length(lats))) %>%
left_join(
pts@data %>%
mutate(
lat_band = cut(lat, seq(-90, 90, by=10), labels = lats, include.lowest=T, right=T)) %>%
group_by(lat_band, bottom_zone) %>%
summarize(
records = sum(records)) %>%
ungroup() %>%
mutate(
lat_band = as.numeric(as.character(lat_band)),
bottom_zone = as.character(bottom_zone)) %>%
filter(bottom_zone %in% c('0 - 5','>5')),
by=c('lat_band','bottom_zone')) %>%
mutate(
records = ifelse(is.na(records), 0, records))
# get mollweide latitudinal coordinates
p = data_frame(lat = c(-90,90,seq(-90, 90, by=10),lats), lon = 0)
coordinates(p) = ~lon+lat
proj4string(p) = CRS(crs_gcs)
m = spTransform(p, CRS(crs_mol))
h_d = h_d %>%
left_join(
data_frame(
lat_band = p$lat,
y_mol = coordinates(m)[,2]),
by = 'lat_band')
write_csv(
h_d %>%
select(-y_mol) %>%
spread(bottom_zone, records) %>%
arrange(desc(lat_band)), 'data/obis_hist_lat_records.csv')
lims = data_frame(
lat = c(-90, 90),
y_mol = coordinates(m)[1:2,2])
# plot
h = ggplot(aes(x=y_mol, y=records, colour=bottom_zone), data=h_d) +
geom_point() +
geom_line() +
scale_y_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
scale_color_discrete(name='depth (m)') +
scale_x_continuous(
breaks=coordinates(m)[3:(3+length(seq(-90, 90, by=10))),2],
limits=lims$y_mol) +
coord_cartesian(
xlim = lims$y_mol,
ylim = c(1,max(h_d$records*1.1)), expand=F) +
theme_bw() +
coord_flip() +
theme(
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.justification = c(0,0),
legend.position = c(0,0.65))
# print to pdf
h_pdf = 'fig/obis_hist.pdf'
pdf(h_pdf, width=2, height=6)
print(h)
invisible(dev.off())
system(sprintf('open %s', h_pdf))
# output
h
For the 1 deg cell with the maximum OBIS record density show records by year for coastal (0-5 m) and pelagic (>5 m).
# get records over years for cell
d_s = pts@data %>%
rownames_to_column() %>%
filter(
# rasterize: points that fall on a border between cells are placed
# in the cell to the right and/or in the cell below
lon > max_xy[1]-0.5, lon <= max_xy[1]+0.5,
lat > max_xy[2]-0.5, lat <= max_xy[2]+0.5) %>%
group_by(year, bottom_zone) %>%
summarize(
records = sum(records)) %>%
ungroup() %>%
mutate(
records_cum = cumsum(records),
records_cum_pct = records_cum / sum(records)) %>%
filter(
year >= 2000)
# get summaries of previous and total across bottom_zones
records_prev = d_s %>%
filter(year == 2000) %>%
summarize(
records_prev = sum(records_cum) - sum(records)) %>%
.$records_prev
records_tot = d_s %>%
filter(year == max(year)) %>%
summarize(
records_tot = sum(records_cum)) %>%
.$records_tot
# plot time series
s = ggplot(d_s, aes(x=year, y=records, color=bottom_zone)) +
geom_point() +
geom_line() +
scale_y_log10(
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
scale_color_discrete(name='depth (m)') +
theme(legend.justification=c(0,1), legend.position=c(0,1)) +
theme_bw()
# print to pdf
s_pdf = 'fig/obis_ts.pdf'
pdf(s_pdf, width=3, height=1.5)
print(s)
invisible(dev.off())
system(sprintf('open %s', s_pdf))
# show s
s
The max value 70.3804142 is found at -80.5, 25.5. Although the time series for this cell started in 1903
, only 1236 of 1565733 records were observed prior to 2000
.
Here’s the table of values in the time-series histogram used since 2000.
DT::datatable(d_s)
Manually combined the map and histogram in Adobe Illustrator from the pdf outputs: obis_map.pdf + obis_hist.pdf -> obis_ts.pdf.
Figure 1. Density of OBIS records per km2 by 1 degree cells on log10 scale observed in shallow water (<20m) at coastal (0-5m) and pelagic (>5m) locations (by bathymetric depth) having a valid year and occurring in the ocean. The map and latitudinal histogram (right) highlight the lack of observations spatially, particularly in the tropics and southern hemisphere. Even for the highlighted cell (red dot in Florida, US) having the highest observational density (70.4 records/km2), the number of records varies widely across years (lower left). Routine satellite remote sensing combined with modeling promise to fill these widespread data gaps in space and time for assessment of marine biodiversity in the coastal and pelagic zones globally.
For comparison, here’s the combined map and histogram for all depths (ie not filtered by sample_depth <= 20m) and regardless of valid year: obis_map_hist_all-depths.pdf.