find-duplicate-ho-id.py: find CKAN records of data.json type that carry more than one harvest_object_id extra. The script pages through the catalog's package_search API, inspects each dataset's extras, and prints the IDs of affected datasets to stdout (progress messages go to stderr).
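A duplicate shows up as two or more extras with the key harvest_object_id on the same dataset. For illustration, this is the shape the script looks for in each package_search result; the names and values below are made up:

```
# Hypothetical package_search result entry; id, name, and values are made up.
dataset = {
    "id": "c2fc0a23-0000-0000-0000-000000000000",
    "name": "example-dataset",
    "extras": [
        {"key": "source_datajson_identifier", "value": "true"},
        {"key": "harvest_object_id", "value": "1111-aaaa"},
        {"key": "harvest_object_id", "value": "2222-bbbb"},  # the duplicate
    ],
}
```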
#!/usr/bin/env python
"""
Find CKAN records of data.json type with duplicate harvest_object_id.

USAGE: python find-duplicate-ho-id.py > ids.txt
"""
import urllib2
import json
import math
import sys
import time
import datetime
import logging

SERVER = "https://catalog.data.gov"
PAGINATION_SIZE = 1000
TIMEOUT = 20

# All data.json-harvested packages, including collection members.
query_url = SERVER + "/api/action/package_search?fq=((collection_package_id:*%20OR%20*:*)+AND+source_datajson_identifier:true)"


def get_count(query_url):
    """Return the total number of records matching the query."""
    url_count = query_url + "&rows=0"
    response = urllib2.urlopen(url_count, timeout=TIMEOUT)
    response_dict = json.loads(response.read())
    assert response_dict['success'] is True
    return response_dict['result']['count']


def main():
    count = get_count(query_url)
    pages = int(math.ceil(1.0 * count / PAGINATION_SIZE))
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.INFO,
        format='%(message)s'
    )
    logging.info("%s records to go." % count)

    # get datasets for each page and write duplicate IDs to stdout
    for i in range(pages):
        log = '{0:.19} doing page {1}/{2}...'.format(
            str(datetime.datetime.now()),
            i + 1,
            pages
        )
        url = '%s&rows=%s&start=%s' % (
            query_url,
            PAGINATION_SIZE,
            i * PAGINATION_SIZE
        )
        response_dict = None
        attempts = 1
        while attempts < 50:
            try:
                response = urllib2.urlopen(url, timeout=TIMEOUT)
            except urllib2.HTTPError as e:
                logging.error('Error Code: %s. Message: %s. Retry %s' % (
                    e.code,
                    e.read(),
                    attempts
                ))
            except KeyboardInterrupt:
                raise
            except Exception:
                logging.error("Unexpected error: %s", sys.exc_info()[0])
            else:
                response_dict = json.loads(response.read())
                assert response_dict['success'] is True
                logging.info('%s done.' % log)
                break
            wait_time = 2 * attempts  # wait longer with each failed attempt
            logging.info('wait %s seconds before next attempt...' % wait_time)
            time.sleep(wait_time)
            attempts += 1
        if response_dict is None:
            logging.error('giving up on page %s after %s attempts.' % (i + 1, attempts))
            sys.exit(1)
        datasets = response_dict['result']['results']
        if not datasets:
            logging.info('got nothing on page %s.' % i)
            sys.exit()
        for n, dataset in enumerate(datasets):
            logging.debug(' %s/%s: %s' % (
                n + 1 + PAGINATION_SIZE * i,
                count,
                dataset['name']
            ))
            # a dataset is flagged if a second harvest_object_id extra
            # is found after the first one
            found_ho_id = False
            for extra in dataset['extras']:
                if extra['key'] == 'harvest_object_id':
                    if found_ho_id:
                        print("%s" % dataset['id'])
                        break
                    found_ho_id = True


if __name__ == '__main__':
    main()
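To spot-check an ID from ids.txt, the same catalog exposes CKAN's package_show action. A minimal sketch (Python 2 to match the script above; the script name and helper are hypothetical) that prints the harvest_object_id extras of one dataset:

```
# Hypothetical spot-check sketch, assuming the same CKAN action API.
# Usage: python check-one.py <dataset-id>
import urllib2
import json
import sys

SERVER = "https://catalog.data.gov"

def show_ho_ids(dataset_id):
    # package_show returns the full dataset dict, including its extras
    url = SERVER + "/api/action/package_show?id=" + dataset_id
    response_dict = json.loads(urllib2.urlopen(url, timeout=20).read())
    assert response_dict['success'] is True
    for extra in response_dict['result']['extras']:
        if extra['key'] == 'harvest_object_id':
            print("%s" % extra['value'])

if __name__ == '__main__':
    show_ho_ids(sys.argv[1])
```

A dataset flagged by the script should print two or more values here; a clean one prints exactly one.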