find-duplicate-ho-id.py: find CKAN records of data.json type that carry more than one harvest_object_id extra. The script pages through the catalog's package_search API, inspects each dataset's extras, and prints the IDs of affected datasets to stdout (progress messages go to stderr).
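A duplicate shows up as two or more extras with the key harvest_object_id on the same dataset. For illustration, this is the shape the script looks for in each package_search result; the names and values below are made up:

```
# Hypothetical package_search result entry; id, name, and values are made up.
dataset = {
    "id": "c2fc0a23-0000-0000-0000-000000000000",
    "name": "example-dataset",
    "extras": [
        {"key": "source_datajson_identifier", "value": "true"},
        {"key": "harvest_object_id", "value": "1111-aaaa"},
        {"key": "harvest_object_id", "value": "2222-bbbb"},  # the duplicate
    ],
}
```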
#!/usr/bin/env python
"""
Find CKAN records of data.json type with duplicate harvest_object_id.

USAGE: python find-duplicate-ho-id.py > ids.txt
"""
import urllib2
import json
import math
import sys
import time
import datetime
import logging

SERVER = "https://catalog.data.gov"
PAGINATION_SIZE = 1000
TIMEOUT = 20

# All data.json-harvested packages, including collection members.
query_url = SERVER + "/api/action/package_search?fq=((collection_package_id:*%20OR%20*:*)+AND+source_datajson_identifier:true)"


def get_count(query_url):
    """Return the total number of records matching the query."""
    url_count = query_url + "&rows=0"
    response = urllib2.urlopen(url_count, timeout=TIMEOUT)
    response_dict = json.loads(response.read())
    assert response_dict['success'] is True
    return response_dict['result']['count']


def main():
    count = get_count(query_url)
    pages = int(math.ceil(1.0 * count / PAGINATION_SIZE))
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.INFO,
        format='%(message)s'
    )
    logging.info("%s records to go." % count)

    # get datasets for each page and write duplicate IDs to stdout
    for i in range(pages):
        log = '{0:.19} doing page {1}/{2}...'.format(
            str(datetime.datetime.now()),
            i + 1,
            pages
        )
        url = '%s&rows=%s&start=%s' % (
            query_url,
            PAGINATION_SIZE,
            i * PAGINATION_SIZE
        )
        response_dict = None
        attempts = 1
        while attempts < 50:
            try:
                response = urllib2.urlopen(url, timeout=TIMEOUT)
            except urllib2.HTTPError as e:
                logging.error('Error Code: %s. Message: %s. Retry %s' % (
                    e.code,
                    e.read(),
                    attempts
                ))
            except KeyboardInterrupt:
                raise
            except Exception:
                logging.error("Unexpected error: %s", sys.exc_info()[0])
            else:
                response_dict = json.loads(response.read())
                assert response_dict['success'] is True
                logging.info('%s done.' % log)
                break
            wait_time = 2 * attempts  # wait longer with each failed attempt
            logging.info('wait %s seconds before next attempt...' % wait_time)
            time.sleep(wait_time)
            attempts += 1
        if response_dict is None:
            logging.error('giving up on page %s after %s attempts.' % (i + 1, attempts))
            sys.exit(1)
        datasets = response_dict['result']['results']
        if not datasets:
            logging.info('got nothing on page %s.' % i)
            sys.exit()
        for n, dataset in enumerate(datasets):
            logging.debug(' %s/%s: %s' % (
                n + 1 + PAGINATION_SIZE * i,
                count,
                dataset['name']
            ))
            # a dataset is flagged if a second harvest_object_id extra
            # is found after the first one
            found_ho_id = False
            for extra in dataset['extras']:
                if extra['key'] == 'harvest_object_id':
                    if found_ho_id:
                        print("%s" % dataset['id'])
                        break
                    found_ho_id = True


if __name__ == '__main__':
    main()
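To spot-check an ID from ids.txt, the same catalog exposes CKAN's package_show action. A minimal sketch (Python 2 to match the script above; the script name and helper are hypothetical) that prints the harvest_object_id extras of one dataset:

```
# Hypothetical spot-check sketch, assuming the same CKAN action API.
# Usage: python check-one.py <dataset-id>
import urllib2
import json
import sys

SERVER = "https://catalog.data.gov"

def show_ho_ids(dataset_id):
    # package_show returns the full dataset dict, including its extras
    url = SERVER + "/api/action/package_show?id=" + dataset_id
    response_dict = json.loads(urllib2.urlopen(url, timeout=20).read())
    assert response_dict['success'] is True
    for extra in response_dict['result']['extras']:
        if extra['key'] == 'harvest_object_id':
            print("%s" % extra['value'])

if __name__ == '__main__':
    show_ho_ids(sys.argv[1])
```

A dataset flagged by the script should print two or more values here; a clean one prints exactly one.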