Last active
October 8, 2019 13:54
-
-
Save vpetersson/f20efe6194460cc28d49 to your computer and use it in GitHub Desktop.
Parse and dump a sitemap (using Python)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/) | |
Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson | |
""" | |
from bs4 import BeautifulSoup | |
import requests | |
def get_sitemap(url):
    """Download the sitemap document at *url*.

    Returns the response body as text when the server answers HTTP 200;
    otherwise prints a warning and returns None (callers must be
    prepared for the None case).
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    # Print as a function call so the script also runs under Python 3
    # (the original used a Python 2 print statement).
    print('Unable to fetch sitemap: %s.' % url)
def process_sitemap(s):
    """Extract the text of every <loc> tag from sitemap markup *s*.

    Returns a list of URL strings in document order.
    """
    # Name the parser explicitly: BeautifulSoup(s) with no parser guesses
    # one, emits a GuessedAtParserWarning, and can behave differently
    # depending on which parsers are installed.
    soup = BeautifulSoup(s, 'html.parser')
    return [loc.text for loc in soup.find_all('loc')]
def is_sub_sitemap(s):
    """Return True when *s* looks like the URL of a nested sitemap.

    A sub-sitemap URL ends in '.xml' and contains the word 'sitemap'.
    """
    return s.endswith('.xml') and 'sitemap' in s
def parse_sitemap(s):
    """Flatten a sitemap document into a list of page URLs.

    Treats the sitemap's <loc> entries as a work queue: any entry that
    itself looks like a nested sitemap is fetched and its entries are
    queued in turn; every other entry is collected as a page URL.
    """
    pending = process_sitemap(s)
    pages = []
    while pending:
        url = pending.pop()
        if not is_sub_sitemap(url):
            pages.append(url)
            continue
        # Nested sitemap: download it and queue its entries for processing.
        pending.extend(process_sitemap(get_sitemap(url)))
    return pages
def main():
    """Fetch the CloudSigma sitemap and print every page URL it contains."""
    sitemap = get_sitemap('https://www.cloudsigma.com/sitemap.xml')
    # Print as a function call so the script also runs under Python 3
    # (the original used a Python 2 print statement).
    print('\n'.join(parse_sitemap(sitemap)))


if __name__ == '__main__':
    main()
Hi,
sorry for the question — I'm new to Python.
Where does the code dump the sitemap?
Do I need to add any code to write it to a file?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The above code doesn't account for a
<sitemap><loc>
URL containing any query arguments. I created an improved version at https://gist.github.com/HQJaTu/cd66cf659b8ee633685b43c5e7e92f05 to address that issue. The obvious solution is to first parse the URL and check its path part.