Skip to content

Instantly share code, notes, and snippets.

@FuhuXia
Last active August 21, 2024 20:52
Show Gist options
  • Save FuhuXia/7cf429f5650c6bea5fc621790592bc0d to your computer and use it in GitHub Desktop.
Save FuhuXia/7cf429f5650c6bea5fc621790592bc0d to your computer and use it in GitHub Desktop.
harvest source xml analysis
import requests
from time import sleep
CKAN_URL = 'https://catalog.data.gov'
# Ask the CKAN API for the harvest sources. The query requests rows=1000, so
# at most 1000 sources come back from this single call.
harvest_source_url = f'{CKAN_URL}/api/action/package_search?fq=(dataset_type:harvest)&rows=1000'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(harvest_source_url, timeout=30)
# Fail with a clear HTTP error rather than a KeyError / JSON decode error below.
response.raise_for_status()
harvest_sources = response.json()['result']['results']
# Tally the harvest sources per source_type, then print one
# "<source_type>: <count>" line for each type in first-seen order.
source_types = {}
for harvest_source in harvest_sources:
    kind = harvest_source['source_type']
    source_types[kind] = source_types.get(kind, 0) + 1
for kind, total in source_types.items():
    print(f'{kind}: {total}')
# Keep only the harvest sources whose source_type is 'waf' or
# 'waf-collection'; every other source is dropped from the list.
harvest_sources = [
    candidate
    for candidate in harvest_sources
    if candidate['source_type'] == 'waf'
    or candidate['source_type'] == 'waf-collection'
]
# For every remaining harvest source that has at least one dataset, print its
# name and dataset count, then a link to the harvested XML of its most
# recently modified dataset.
def _fetch_result(url):
    """Return the ``result`` member of a CKAN action API JSON response.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and a requests timeout exception when no reply arrives in time.
    """
    reply = requests.get(url, timeout=30)
    reply.raise_for_status()
    return reply.json()['result']


count = 0
for source in harvest_sources:
    source_detail = _fetch_result(
        f'{CKAN_URL}/api/action/harvest_source_show?id={source["id"]}'
    )
    dataset_count = source_detail["status"]["total_datasets"]
    if dataset_count > 0:
        count += 1
        print(f'Harvest Source {count}: {dataset_count} datasets in {source["name"]}')

        # Fetch the single most recently modified dataset of this source.
        # NOTE(review): the lowercase "and" in this filter query may not be
        # treated as a boolean operator by the search backend - confirm.
        harvest_object_api = f'{CKAN_URL}/api/action/package_search?fq=(harvest_source_id:{source["id"]} and collection_package_id:*%20OR%20*:*)&sort=metadata_modified%20desc&rows=1'
        datasets = _fetch_result(harvest_object_api)['results']

        # Recompute the id for every source so that a dataset lacking the
        # extra can never inherit the id found for a previous source.
        harvest_object_id = None
        if datasets:
            harvest_object_id = next(
                (
                    extra['value']
                    for extra in datasets[0].get('extras', [])
                    if extra['key'] == 'harvest_object_id'
                ),
                None,
            )

        if harvest_object_id is None:
            print(f'No harvest object found for {source["name"]}; skipping XML sample')
        else:
            object_extras = _fetch_result(
                f'{CKAN_URL}/api/action/harvest_object_show?id={harvest_object_id}'
            )['extras']
            # An 'original_format' extra is reported as FGDC and linked via
            # the '/original' URL; anything else is reported as ISO.
            if object_extras.get('original_format'):
                xml_format = "FGDC"
                trailing = '/original'
            else:
                xml_format = "ISO"
                trailing = ''
            print(f'{xml_format} XML sample: {CKAN_URL}/harvest/object/{harvest_object_id}{trailing}')
    # sleep for 1 second to avoid rate limiting (a request was made for this
    # source whether or not it had datasets)
    sleep(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment