# This example shows one way to access subsets of CEDEN data via the California Open Data Portal API (it's not
# necessarily the only or best way though) - if you find any problems or have questions, please contact: [email protected]
# This example applies to the following CEDEN datasets, which have been split into separate
# resources by year (due to file size limitations):
#     Water Chemistry: https://data.ca.gov/dataset/surface-water-chemistry-results
#     Habitat: https://data.ca.gov/dataset/surface-water-habitat-results
#     Tissue: https://data.ca.gov/dataset/surface-water-aquatic-organism-tissue-sample-results
# It assumes that you have your own API key saved in a system environment variable
# named "data_portal_key" (to obtain an API key, create an account at data.ca.gov, then go to your
# account page and look for the "API Key" header in the lower left side of the page)
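# (for example, you can set the key for your current session with Sys.setenv() -- the value below is
# a placeholder -- or add an entry like data_portal_key=your-api-key to your .Renviron file so that
# it persists across sessions)
# Sys.setenv(data_portal_key = 'your-api-key-here')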
# NOTE: There appears to be a limit of 32,000 records that can be returned from the API in a single call -- if your API query
# is likely to return more records than that limit, you'll need to break it up into multiple calls to the API
# (one way to do this is sketched at the end of these notes) -- the function below returns a warning message when this occurs
# NOTE: to query across all years in a dataset, you'll need to enter the package/dataset ID for that dataset,
# rather than the ID for an individual resource (since the data is split into multiple resources by year,
# you'll need to query across multiple resources) -- the package IDs for the datasets this example applies to are:
#     Water Chemistry: 28d7a81d-6458-47bd-9b79-4fcbfbb88671
#     Habitat: f5edfd1b-a9b3-48eb-a33e-9c246ab85adf
#     Tissue: 38cb5cca-1500-42e7-b359-e8e3c5d1e087
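# a rough sketch of one way to stay under the record limit (hypothetical -- shown here with the water
# chemistry package ID and the same kind of date filters used in the example query at the bottom of
# this script) is to break the date range into smaller windows and combine the results:
# df_part_1 <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
#                           filters_date_start = c('SampleDate' = '2016-07-01'),
#                           filters_date_end = c('SampleDate' = '2016-12-31'))
# df_part_2 <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
#                           filters_date_start = c('SampleDate' = '2017-01-01'),
#                           filters_date_end = c('SampleDate' = '2017-06-30'))
# df_combined <- bind_rows(df_part_1, df_part_2)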
# load packages
library(ckanr)
library(dplyr)
library(lubridate)
library(httr)
library(jsonlite)
library(glue)
library(readr)
library(stringr)
library(purrr)

# set the option to use standard notation rather than scientific notation
options(scipen = 999)
# create a generic function to run an API query and return the formatted results, given a URL for the query
generic_api_query <- function(query_url) {
  # encode URL
  url_encoded <- URLencode(query_url)
  # get the data
  query_response <- httr::GET(url_encoded, add_headers(Authorization = Sys.getenv('data_portal_key')))
  query_char <- rawToChar(query_response$content)
  query_content <- jsonlite::fromJSON(query_char)
  resource_records <- query_content$result$records
  # get information about the fields included in the dataset (allows for reordering the fields to match the original dataset)
  resource_fields <- query_content$result$fields %>%
    filter(!id %in% c('_id', '_full_text')) %>% # drop these two fields, which are added by the portal and not actually part of the dataset
    pull(id)
  # format the data - convert to tibble, set the data types, and reorder the fields to match the order of the original dataset
  resource_records <- tibble(resource_records) %>%
    # type_convert(na = c("", "NaN")) %>% # moved to the function below, so that it's not done separately for each year
    mutate_all(as.character) %>% # to make sure the data frames from each year can be combined
    select(all_of(resource_fields))
  # print a warning if the API limit was reached by the query
  if (nrow(resource_records) == 32000) {
    warning('WARNING: the API record limit was reached - it\'s likely that some records matching your query were not returned')
  }
  # return the results
  return(resource_records)
}
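# a minimal usage sketch (not run -- the resource ID below is a placeholder; substitute the ID of an
# individual resource from one of the datasets listed above; the LIMIT clause is just standard SQL to
# keep the example small):
# test_records <- generic_api_query(
#   'https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from "resource-id-here" LIMIT 10')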
# create a function that takes a single year, a dataframe of resource_ids (by year), and the API filters,
# then builds and returns the query URL for that year
format_url_year <- function(query_year, resource_id_table, filters_date_start, filters_date_end, filters_text, filters_numeric, filters_numeric_drop_na) {
  # get the resource ID corresponding to the given year
  resource_id <- resource_id_table %>%
    filter(year == query_year) %>%
    pull(resource_id)
  # get the range of start and end dates (in case there are different dates entered for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the years to apply the starting date filters to
  if (!is.na(filters_date_start[1])) {
    use_date_query_years_start <- year(min(dates_start)):year(max(dates_start))
  } else {
    use_date_query_years_start <- c()
  }
  # get the years to apply the ending date filters to
  if (!is.na(filters_date_end[1])) {
    use_date_query_years_end <- year(min(dates_end)):year(max(dates_end)) # in case there are multiple start or end dates, make sure the date filters are applied to all years needed
  } else {
    use_date_query_years_end <- c()
  }
  # format the filter for minimum/starting dates
  if (!is.na(filters_date_start[1]) & (query_year %in% use_date_query_years_start)) {
    query_date_start <- map2_chr(filters_date_start, names(filters_date_start), ~glue("\"{.y}\" >= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_start <- c()
  }
  # format the filters for maximum/ending dates
  if (!is.na(filters_date_end[1]) & (query_year %in% use_date_query_years_end)) {
    query_date_end <- map2_chr(filters_date_end, names(filters_date_end), ~glue("\"{.y}\" <= '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_date_end <- c()
  }
  # format the filters for text values
  if (!is.na(filters_text[1])) {
    query_text_filter <- map2_chr(filters_text, names(filters_text), ~glue("\"{.y}\" = '{.x}'")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_text_filter <- c()
  }
  # format the filters for numeric values
  if (!is.na(filters_numeric[1])) {
    query_numeric_filter <- map2_chr(filters_numeric, names(filters_numeric), ~glue("\"{.y}\" {.x}")) %>%
      unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_filter <- c()
  }
  # format the filters to drop missing/null numeric values
  if (!is.na(filters_numeric_drop_na[1])) {
    query_numeric_drop_na <- map_chr(filters_numeric_drop_na, ~glue("\"{.x}\" <> 'NaN'")) %>%
      # unname() %>%
      glue_collapse(sep = ' AND ')
  } else {
    query_numeric_drop_na <- c()
  }
  # combine all of the different filter types into one statement
  filters_combined <- glue_collapse(c(query_date_start, query_date_end, query_text_filter, query_numeric_filter, query_numeric_drop_na), sep = ' AND ')
  # create the URL for the API call
  api_url <- glue("https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from \"{resource_id}\"")
  if (length(filters_combined) > 0) { # if there are additional filters, add them
    api_url <- glue("{api_url} WHERE {filters_combined}")
  }
  return(api_url)
}
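# for illustration, a URL built by the function above looks roughly like the following (before URL
# encoding; the resource ID shown is a placeholder):
# https://data.ca.gov/api/3/action/datastore_search_sql?sql=SELECT * from "resource-id-here"
#   WHERE "SampleDate" >= '2016-07-01' AND "Analyte" = 'Oxygen, Dissolved, Total' AND "Result" > 10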
# create a function to download data from the API, given an input package ID and optional filters for date, text, and numeric fields
get_api_data <- function(package_id, filters_date_start = NA, filters_date_end = NA, filters_text = NA, filters_numeric = NA, filters_numeric_drop_na = NA) {
  # NOTES: any of the filters_... arguments can be left blank
  #   for date and text filters, enter inputs as named character vectors (multiple name-value pairs can be entered) -- e.g.: c('field_name' = 'query_value')
  #   for numeric filters, include the condition (>, <, >=, <=, <>) as part of the value (multiple name-condition pairs can be entered) -- e.g.: c('field_name' = '> query_value')
  #   for the filters_numeric_drop_na argument, just enter the name(s) of the field(s) to drop missing numeric values from (multiple field names can be entered) -- e.g.: c('field_name')
  # get a table of the resource IDs in the dataset
  package_info <- package_show(package_id, as = 'table',
                               url = 'https://data.ca.gov/',
                               key = Sys.getenv('data_portal_key'))
  package_resources <- package_info$resources
  package_resources <- package_resources %>%
    filter(format %in% c('CSV')) %>% # filter for just the resources containing csv files
    select(name, id) %>% # drop un-needed fields
    mutate(name = str_replace(name, pattern = 'CEDEN Water Chemistry Data Prior to 2000', '1999')) %>% # use 1999 to represent any year prior to 2000, since all pre-2000 data is in one resource
    mutate(name = parse_number(name)) %>% # extract the year from the resource name
    rename(year = name, resource_id = id)
  # determine the range of resources to query, based on the start and end dates (check for multiple/different start or end dates for different fields)
  dates_start <- range(as.Date(unname(filters_date_start)))
  dates_end <- range(as.Date(unname(filters_date_end)))
  # get the first year to query
  if (is.na(filters_date_start[1]) | year(min(dates_start)) < 2000) {
    year_start <- 1999 # treat 1999 as the first year, since all data prior to 2000 is in one resource/file
  } else {
    year_start <- year(min(dates_start))
  }
  # get the last year to query
  if (is.na(filters_date_end[1]) | year(max(dates_end)) > max(package_resources$year)) {
    year_end <- max(package_resources$year) # only query up to the latest year for which data is available
  } else {
    year_end <- year(max(dates_end))
  }
  # get the range of years to query
  query_years_range <- year_start:year_end # use the minimum of the start dates and the maximum of the end dates to determine which resources to query
  # pass the query filter data to a separate function to build a URL for each year, then run the query for each year
  # and combine the results into a single data frame
  return(map_df(query_years_range, ~ format_url_year(., package_resources, filters_date_start,
                                                     filters_date_end, filters_text, filters_numeric,
                                                     filters_numeric_drop_na) %>%
                  generic_api_query()) %>%
           type_convert(na = c("", "NaN")) # detect field types and convert
  )
}
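# note: any of the filter arguments can be omitted -- for example, a query across all years for a single
# monitoring station could look like the following (sketch only, not run here, since a query with no date
# filters covers every year and may hit the API record limit noted above):
# df_station <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
#                            filters_text = c('StationCode' = '309ASB'))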
#### EXAMPLE QUERY - GET DATA (using the water chemistry dataset) ####
# make an API call to get some data (in this case, get samples between 2016-07-01 and 2018-06-30,
# where the analyte name is "Oxygen, Dissolved, Total", the units are "mg/L", and the result value is
# greater than 10 and is not missing/null)
df_chemistry <- get_api_data(package_id = '28d7a81d-6458-47bd-9b79-4fcbfbb88671',
                             filters_date_start = c('SampleDate' = '2016-07-01'),
                             filters_date_end = c('SampleDate' = '2018-06-30'),
                             filters_text = c('Analyte' = 'Oxygen, Dissolved, Total',
                                              'Unit' = 'mg/L'),
                             filters_numeric = c('Result' = '> 10'),
                             filters_numeric_drop_na = c('Result'))

#### EXAMPLE QUERY - VERIFY DATA ####
# verify that records with the correct analyte names and units were returned, and get the number of records
df_chemistry %>% count(Analyte, Unit)
# # A tibble: 1 x 3
#   Analyte                  Unit      n
#   <chr>                    <chr> <int>
# 1 Oxygen, Dissolved, Total mg/L  13589

# Verify that the sample date range is correct
range(df_chemistry$SampleDate)
# [1] "2016-07-01 UTC" "2018-06-28 UTC"

# Verify that the result range is correct
range(df_chemistry$Result)
# [1] 10.00012 591.00000

# view a summary of the entire dataset
glimpse(df_chemistry)
# Rows: 13,589
# Columns: 72
# $ Program <chr> "Central Coast Cooperative Monitoring Program for Agriculture", "Central...
# $ ParentProject <chr> "RWB3 Cooperative Monitoring Program", "RWB3 Cooperative Monitoring Prog...
# $ Project <chr> "RWB 3 Cooperative Monitoring Program", "RWB 3 Cooperative Monitoring Pr...
# $ StationName <chr> "Alisal Slough @ White Barn", "Bradley Channel @ Jones St", "Bradley Cha...
# $ StationCode <chr> "309ASB", "312BCJ", "312BCJ", "312BCJ", "312BCJ", "309BLA", "309BLA", "3...
# $ SampleDate <dttm> 2016-12-13, 2016-08-24, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ CollectionTime <dttm> 1899-12-30 13:55:00, 1899-12-30 09:01:00, 1899-12-30 15:25:00, 1899-12-...
# $ LocationCode <chr> "Midchannel", "Midchannel", "Midchannel", "Midchannel", "Midchannel", "M...
# $ CollectionDepth <dbl> 0.12, 0.12, 0.12, 0.12, 0.09, 0.12, 0.12, 0.12, 0.03, 0.12, 0.12, 0.12, ...
# $ UnitCollectionDepth <chr> "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m", "m...
# $ SampleTypeCode <chr> "Not Recorded", "Not Recorded", "Not Recorded", "Not Recorded", "Not Rec...
# $ CollectionReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ ResultsReplicate <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
# $ LabBatch <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabSampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MatrixName <chr> "samplewater", "samplewater", "samplewater", "samplewater", "samplewater...
# $ MethodName <chr> "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMeasure", "FieldMe...
# $ Analyte <chr> "Oxygen, Dissolved, Total", "Oxygen, Dissolved, Total", "Oxygen, Dissolv...
# $ Unit <chr> "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", "mg/L", ...
# $ Result <dbl> 10.40, 10.86, 17.25, 11.56, 10.79, 12.60, 13.90, 10.13, 13.37, 11.50, 11...
# $ Observation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ MDL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ RL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ResultQualCode <chr> "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=", "=...
# $ QACode <chr> "None", "None", "None", "None", "None", "None", "None", "None", "None", ...
# $ BatchVerification <chr> "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", "NR", ...
# $ ComplianceCode <chr> "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "Com", "C...
# $ SampleComments <chr> NA, "Samples were returned to SMX without ever being sent to FGL. Resam...
# $ CollectionComments <chr> NA, NA, NA, "Chlorophyll a sample sent to Sierra", NA, NA, NA, NA, NA, N...
# $ ResultsComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ BatchComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ EventCode <chr> "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", "WQ", ...
# $ ProtocolCode <chr> "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP_QAPP_2012", "CMP...
# $ SampleAgency <chr> "PER", "TetraTech", "TetraTech", "TetraTech", "TetraTech", "PER", "PER",...
# $ GroupSamples <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ CollectionMethodName <chr> "Field Method", "Field Method", "Field Method", "Field Method", "Field M...
# $ Latitude <dbl> 36.72545, 34.94544, 34.94544, 34.94544, 34.94544, 36.70852, 36.70852, 34...
# $ Longitude <dbl> -121.7302, -120.4168, -120.4168, -120.4168, -120.4168, -121.7489, -121.7...
# $ CollectionDeviceDescription <chr> "PER_Hydrolab DS4a", "YSI 6820 v2", "YSI 6820 v2", "YSI 6820 v2", "YSI 6...
# $ CalibrationDate <dttm> 2016-12-13, 2016-08-23, 2016-09-27, 2016-10-24, 2016-11-22, 2016-09-27,...
# $ PositionWaterColumn <chr> "Subsurface", "Subsurface", "Subsurface", "Subsurface", "Subsurface", "S...
# $ PrepPreservationName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ PrepPreservationDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DigestExtractMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DigestExtractDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ AnalysisDate <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# $ DilutionFactor <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ExpectedValue <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LabAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmittingAgency <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SubmissionCode <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ OccupationMethod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StartingBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DistanceFromBank <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitDistanceFromBank <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StreamWidth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStreamWidth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ StationWaterDepth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UnitStationWaterDepth <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroMod <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ HydroModLoc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailWQComments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ ChannelWidth <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ UpstreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DownStreamLength <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ TotalReach <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ LocationDetailBAComments <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ SampleID <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# $ DW_AnalyteName <chr> "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, Dissolved", "Oxygen, ...
# $ DataQuality <chr> "Unknown data quality", "Unknown data quality", "Unknown data quality", ...
# $ DataQualityIndicator <chr> "BatchVerification:NR", "BatchVerification:NR", "BatchVerification:NR", ...
# $ Datum <chr> "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83...
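# to save the query results locally, you could write them out to a csv file, e.g. (the file name below
# is just an example):
# write_csv(df_chemistry, 'ceden_chemistry_dissolved_oxygen_2016_2018.csv')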