• 1 Occurrence data
• 1.1 Setting up your workspace
• 1.2 Zero filling
• 2 Environmental data
• 3 Model fitting
• 4 Model validation

The script and data in this file draws on several sources, including:

1. The ATLAS of Living Australia (ALA), script modified from script written by Jenna Wraith, ALA Training and Outreach Coordinator
1. Stevenson M, Westgate M, Newman P (2022). galah: Atlas of Living Australia (ALA) Data and Resources in R. R package version 1.3.1
2. See for vingnettes on how to use the package
2. Global Biodiversity Information Facility (GBIF)

# 1 Occurrence data

## 1.1 Setting up your workspace

First, if you are new to using R, we strongly suggest you visit this website and fo through that material before trying this.

You should be able to run all this code in EcoCommons’ coding cloud using either a jupyter notebook (R Kernal) or by importing this notebook into R studio as an R Markdown.

Let’s start by installing and loading the following packages (need to install and load the package pacman first!):

pacman::p_load(galah, rgbif, maps, tidyr, jpeg, raster, rgeos, sp, knitr, rmarkdown)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #sets your working directory to the location your scrip or markdown is saved in direct <- "/Users/s2992269/Documents/Use_cases" folder <- "/SDM_in_R" knitr::opts_knit$set(root.dir=paste0(direct, folder))

Create a variety of folders in your working directory, if statement ignores this if the folder exists already.

raw_data_folder <- paste0(getwd(),"/raw_data")
if (!file.exists(raw_data_folder)) {dir.create(raw_data_folder)}

data_folder   <- paste0(getwd(),"/data")
if (!file.exists(data_folder)) {dir.create(data_folder)}

results_folder <- paste0(getwd(),"/results")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results1")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results2")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results3")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results4")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results_brt")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results_glm")
if (!file.exists(results_folder)) {dir.create(results_folder)}

results_folder <- paste0(getwd(),"/results_gam")
if (!file.exists(results_folder)) {dir.create(results_folder)}

scripts_folder <- paste0(getwd(),"/scripts")
if (!file.exists(scripts_folder)) {dir.create(scripts_folder)}

raw_data_folder <- paste0(getwd(),"/predictors")
if (!file.exists(raw_data_folder)) {dir.create(raw_data_folder)}

raw_data_folder <- paste0(getwd(),"/predictors_future")
if (!file.exists(raw_data_folder)) {dir.create(raw_data_folder)}

Set galah_config by adding your email. galah_config(") This email needs to be registered with ALA. You can register here.

galah_config(email="?@gmail.com") #again put your email in here

Select an ATLAS. These configuration options will only be saved for this session. Set preserve=TRUE to preserve them for future sessions.

show_all_atlases() #we will be using the Australian ATLAS data
galah_config(atlas="Australia")
show_all_fields() #show all the fields or columns of data within in the ALA

Find the kinds of data in each field replace the field name of interest in quotes.

search_fields("australian states and territories")
search_fields("coordinateUncertaintyinMeters")
search_fields("occurrenceID")
search_fields("cl22") #field values
search_taxa("Amphibia")
search_taxa(genus="Limnodynastes")
search_taxa(species="Limnodynastes peroni")

Download some occurence data. The galah_call function starts a query, then galah_identify selects the taxa you are interested in, galah_filter selects records for specified columns and atlas_occurrences retrieves the specified occurrence records. Look for records submitted by FrogID:

search_field_values("datasetName") #note you can see more rows, click lower right below

First check the number of records your query will return, there are 67,000+ records in ALA.

galah_call() %>%
galah_identify("Limnodynastes peroni") %>%
atlas_counts()

Then filter those records so only those records from “FrogID” are returned for this species and remove records with a coordinate uncertainty greater than 100m, there are 36,000+ records from the FrogID dataset, and with only coordinates < 100m precision.

galah_call() %>%
galah_identify("Limnodynastes peroni") %>%
galah_filter(datasetName == "FrogID") %>%
galah_filter(stateProvince == "Queensland") %>%
galah_filter(coordinateUncertaintyInMeters < 100) %>%
atlas_counts()

Select the occurrence records, galah_select returns wanted columns. We could also filter by year, but FrogID data is all pretty recent. Often you will want to ensure you are not including really old records in your data (i.e., galah_filter(year >= 2020)).

LiPe <- galah_call() %>%
galah_identify("Limnodynastes peroni") %>%
galah_filter(datasetName == "FrogID") %>%
galah_filter(coordinateUncertaintyInMeters < 100) %>%
galah_filter(stateProvince == "Queensland") %>%
atlas_occurrences()

Get familiar with data; this will return the column names or fields.

head(LiPe)
names(LiPe)
write.csv(LiPe, "raw_data/Limnodynastes peroni.csv")
summary(LiPe) #generate a summary of those data

Drop columns not needed in analyses. Note you will often want to filter on additional fields not shown here.

LiPe <- LiPe[,c("decimalLatitude",
"decimalLongitude",
"eventDate",
"scientificName")]
head(LiPe) #look at the top rows of the new dataset

This is the earliest date, 2017 is fairly recent, and we want to make sure our predictor variables are available for these years.

min(LiPe$eventDate) #this is the earliest date, 2017 is fairly recent max(LiPe$eventDate) #this is the most recent date, 2020
write.csv(LiPe, "data/Limnodynastes peroni.csv") save

Double check the directory within the chunks is correct, noting the directory outside the code chunks might be different (getwd()). Next, read in data and overwrite LiPe:

LiPe <- read.csv(paste0(getwd(),"/data/Limnodynastes peroni.csv"))

FrogID records are pretty clean, but often when you map data you will see odd locations

map("world", xlim=range(LiPe$decimalLongitude), ylim=range(LiPe$decimalLatitude))
points(LiPe[ , c("decimalLongitude", "decimalLatitude")], pch=".")

When you look at these records mapped, you can see that most records are concentrated where people are near the bigger cities of eastern Australiathis kind of sampling bias is common in Australian occurrence data. It takes so much more effort to sample in remote locations.

There are a few things you can do to reduce the problem of bias, you can break up your study area into areas near cities, and more remote areas (we will not do that here).

You can reduce the number of records close to one another. This reduces spatial autocorrelation, and is a good step for these kinds of records before we show how to do spatial - thinning (reducing how close records are together) we need a base raster layer

In all the modelling we will show here, each environmental predictor needs to cover the same area (have the same extent), have grid cells that are the same size (same resolution), and use the same coordinate system to define a method for turning a spherical planet earth into a flat map (same coordinate system and map projection, i.e., Coordinate Reference System or CRS)

A base layer is one with a CRS, extent and resolution that all other layers will be turned into. Generally it is a good idea to use the largest possible extent, i.e. you often get better predictions when you include the entire range of your species, and when you use the finest (smallest grid cells) possible. Smaller grid cells only help prediction when the values vary from one grid cell to the next grid cell. I recommed choosing the variable that is most closely related to the ecology of your species and has the smallest resolution (grid cell size). Which variable do you think will predict best where your species are found?

This is Enhanced Vegetation Index (EVI) which captures greenness index while correcting ndvi issues, from 2004 downloaded from the EcoCommons site.

EVI <- raster("raw_data/ndlc-2004-250m_trend-evi-mean.tif")
plot(EVI)
EVI

LiPe_pts <- SpatialPoints(coords=cbind(LiPe$decimalLongitude, LiPe$decimalLatitude),
CRS(as.character("+proj=longlat +datum=WGS84 +no_defs")) )

require(rgeos)
LiPe_convH <- rgeos::gConvexHull(LiPe_pts)

Check if your convex hull looks correct by plotting useful information on working with spatial data in R.

plot(EVI)
lines(LiPe_convH)

Increase the size of your extent (study area to beyone the extent of your point data) our CRS is in decimal degrees so 1 degree ~ 111km larger.

LiPe_extent <- rgeos::gBuffer(LiPe_convH, width=1)

Check if your convex hull or extent is now larger

plot(EVI)
lines(LiPe_extent)

EVI_LiPe <- raster::crop(EVI,LiPe_extent)
plot(EVI_LiPe)
EVI_LiPe

plot(EVI_LiPe_mask)

Create your reference raster, all other rasters will be set to this resolution, extent and CRS, dividing raster by itself sets all values to 1.

base <- EVI_LiPe_mask/EVI_LiPe_mask
crs(base) <- crs(EVI)
plot(base)
writeRaster(base,"data/base_LiPe.asc", overwrite=TRUE)

Run these lines if you are coming back to the script to load your base layer:

require(raster)
base <- raster("data/base_LiPe.asc")
plot(base) #check that it looks correct

Spatially thin occurrence records so that only one record is selected randomly from within a Grid cell that is 4 times larger than “base”. Aggregate reduce resolution (make grid cells larger) (factor=4) notice grid cells are now 4 times larger.

large_base <- aggregate(base, fact=4)
res(base)
res(large_base)

LiPe_pts <- sp::SpatialPoints(coords=cbind(LiPe$decimalLongitude, LiPe$decimalLatitude),
CRS(as.character("+proj=longlat +datum=WGS84 +no_defs")) )

cell_no <- raster::extract(large_base,
LiPe_pts,
cellnumbers=TRUE)

LiPe_cells <- cbind(LiPe,cell_no)

require(dplyr)
LiPe_thinned <- LiPe_cells %>%
group_by(cells) %>%
slice_sample(n=1)
length(LiPe_thinned$cells) Plot to look at results and write results to file: LiPe_pts2 <- SpatialPoints(coords=cbind(LiPe_thinned$decimalLongitude,
LiPe_thinned\$decimalLatitude),
CRS(as.character("+proj=longlat +datum=WGS84 +no_defs")))
plot(base)
points(LiPe_pts2,pch=20,cex=0.2)
write.csv(LiPe_thinned,"data/LiPe_thinned.csv")

Download all FrogID survey records to capture survey effort, spatial sampling bias, and to zero-fill.

galah_call() %>%
galah_identify("Amphibia") %>%
galah_filter(datasetName == "FrogID") %>%
galah_filter(coordinateUncertaintyInMeters < 100) %>%
galah_filter(stateProvince == "Queensland") %>%
galah_select("datasetName") %>%
atlas_counts()

frogs <- galah_call() %>%
galah_identify("Amphibia") %>%
galah_filter(datasetName == "FrogID") %>%
galah_filter(coordinateUncertaintyInMeters < 100) %>%