ieatkillerbees · March 28, 2014 05:02
diff --git a/eu_opendata_xformer b/eu_opendata_xformer
 from bs4 import BeautifulSoup, UnicodeDammit
 import csv
 import sys

 # Converts the EU open-data XML export (full_export_new.xml) into an Excel-dialect CSV file (full_export_new.csv).
 # This is a best-effort transformer written for one particular export, so it may not handle every record. It is probably not relevant for anyone but me, but if you need it, use it! :)

 # Module metadata.
 # NOTE(review): the e-mail value below looks like a redaction placeholder,
 # presumably inserted by the page this file was copied from -- confirm and
 # restore the real address if it is available.
 __author__ = "Samantha Quinones"
 __email__  = "[email protected]"
 __license__= "Public Domain"

 def get_val(element):
 	"""Return the string content of an element, or an empty string.

 	Reads ``element.string``. An empty unicode string is returned both
 	when ``element`` has no ``string`` attribute (for example when it is
 	``None`` because a lookup found nothing) and when the attribute
 	itself is ``None``, so callers can slice, format and encode the
 	result without checking it first.
 	"""
 	text = getattr(element, "string", None)
 	return u"" if text is None else text

 def get_person(element):
 	"""Build a ``(name, position)`` pair describing a person.

 	``name`` is the title, first name and last name of ``element``
 	formatted with a single space between each part; ``position`` is the
 	text of its ``position`` child. Every part is read through get_val().
 	"""
 	name_parts = (
 		get_val(element.title),
 		get_val(element.firstName),
 		get_val(element.lastName),
 	)
 	position = get_val(element.position)
 	return ("%s %s %s" % name_parts, position)

 def get_contact(element):
 	"""Return a tuple ``(address, phone, fax, more)`` of contact info.

 	* ``address`` -- "number street, town postCode, country"
 	* ``phone``   -- "(+indicPhone) phoneNumber"
 	* ``fax``     -- "(+indicFax) fax", read from the nested ``fax`` element
 	* ``more``    -- text of ``moreContactDetails``

 	Every part is read through get_val(), so a missing child shows up as
 	an empty slot in the formatted string.
 	"""
 	address = "%s %s, %s %s, %s" % (
 		get_val(element.number),
 		get_val(element.street),
 		get_val(element.town),
 		get_val(element.postCode),
 		get_val(element.country),
 	)
 	phone = "(+%s) %s" % (get_val(element.indicPhone), get_val(element.phoneNumber))

 	# The fax parts live one level deeper. When the wrapping ``fax`` element
 	# is absent, look the parts up with a None default so get_val() renders
 	# them as blanks (like the phone parts) instead of the lookup raising.
 	fax_element = element.fax
 	fax = "(+%s) %s" % (
 		get_val(getattr(fax_element, "indicFax", None)),
 		get_val(getattr(fax_element, "fax", None)),
 	)

 	more = get_val(element.moreContactDetails)
 	return (address, phone, fax, more)

 def flatten_list(element):
 	"""Join the text of each item in ``element`` into one ";"-separated string.

 	``element`` is any iterable of items exposing a ``string`` attribute,
 	or ``None``. Items are skipped when they have no ``string`` attribute,
 	when it is ``None``, or when it is empty or whitespace-only, so the
 	join never sees a non-string and the result has no blank entries.
 	Returns an empty string when nothing is left (or ``element`` is None).
 	"""
 	if element is None:
 		return ""

 	texts = []
 	for item in element:
 		text = getattr(item, "string", None)
 		if text is None or not text.strip():
 			continue
 		texts.append(text)

 	return ";".join(texts)

 def get_financials(element):
 	"""Return a tuple of financial information.

 	The tuple is ``(year, cost_range, procurement, grants, other)``:

 	* ``year``       -- "<start>-<end>", the first four characters of the
 	  ``startDate`` and ``endDate`` texts
 	* ``cost_range`` -- "<min> - <max>" taken from
 	  ``financialInformation/turnover/range``; each bound falls back to
 	  "0.0" when it cannot be found or is empty
 	* the remaining three are the texts of ``eurSourcesProcurement``,
 	  ``eurSourcesGrants`` and ``otherFinancialInformation``
 	"""
 	# get_val() hands back ``.string`` as-is, which may be None; ``or u""``
 	# keeps the slice from failing in that case.
 	start = (get_val(element.startDate) or u"")[0:4]
 	end = (get_val(element.endDate) or u"")[0:4]
 	year = "%s-%s" % (start, end)

 	# Any missing link in the chain surfaces as an attribute lookup on None.
 	try:
 		turnover_range = element.financialInformation.turnover.range
 	except AttributeError:
 		turnover_range = None

 	# Apply the same "0.0" default whether the range or just one bound is
 	# missing or empty.
 	cost_min = get_val(getattr(turnover_range, "min", None)) or "0.0"
 	cost_max = get_val(getattr(turnover_range, "max", None)) or "0.0"
 	cost_range = "%s - %s" % (cost_min, cost_max)

 	return (
 		year,
 		cost_range,
 		get_val(element.eurSourcesProcurement),
 		get_val(element.eurSourcesGrants),
 		get_val(element.otherFinancialInformation),
 	)


 def get_header():
 	"""Return the CSV column names, one per field of a row built by parse().

 	parse() emits 26 fields per record, so this list has 26 names in the
 	same order. The contact block contributes four fields (address,
 	phone, fax, other info), and the action-fields and networking values
 	have columns of their own; without these names every column after
 	"contact details" would sit under the wrong heading.
 	"""
 	return [
 		"name/company name",
 		"identificationCode",
 		"registration date",
 		"acronym",
 		"legal status",
 		"website address",
 		"section",
 		"and more precisely",
 		"surname, name",
 		"position",
 		"contact details",
 		"phone",
 		"fax",
 		"other contact info",
 		"goals/remit",
 		"action fields",
 		"number of persons",
 		"complementary info",
 		"initiatives",
 		"fields",
 		"networking",
 		"financial year",
 		"estimated costs",
 		"procurement",
 		"grants",
 		"other financial info",
 	]

 def parse():
 	"""Parse the XML document and yield one CSV-ready row per record.

 	Reads ``full_export_new.xml`` from the current directory and walks
 	the child elements of ``ListOfIRPublicDetail/resultList``. Each
 	yielded row is a list of 26 UTF-8 encoded strings: name,
 	identification code, registration date, acronym, legal status,
 	website, main and sub category, the get_person() pair, the four
 	get_contact() values, goals, action fields, code-of-conduct members
 	and info, activities, interests, networking, and the five
 	get_financials() values.

 	This is a generator, so the file is opened and parsed only when
 	iteration starts.
 	"""
 	# The tree is built by the BeautifulSoup constructor, so the file can
 	# be closed before any record is processed.
 	with open("full_export_new.xml") as xml_file:
 		soup = BeautifulSoup(xml_file, "xml")

 	# Only element children are records; find_all(True, recursive=False)
 	# leaves out any text nodes (e.g. whitespace) sitting between them.
 	result_list = soup.ListOfIRPublicDetail.resultList
 	for rep in result_list.find_all(True, recursive=False):
 		row = [
 			get_val(rep.find("originalName")),
 			get_val(rep.identificationCode),
 			# Pass the element itself: get_val() copes with a missing tag,
 			# whereas dereferencing .string here would raise on None.
 			get_val(rep.registrationDate),
 			get_val(rep.acronym),
 			get_val(rep.legalStatus),
 		]

 		try:
 			row.append(rep.webSiteURL["ns2:href"])
 		except (TypeError, KeyError):
 			# No webSiteURL element (None cannot be subscripted) or no
 			# ns2:href attribute on it.
 			row.append(u'')

 		row.append(get_val(rep.category.mainCategory))
 		row.append(get_val(rep.category.subCategory))

 		row.extend(get_person(rep.legal))
 		row.extend(get_contact(rep.contactDetails))

 		row.append(get_val(rep.goals))
 		row.append(flatten_list(rep.actionFields))

 		row.append(get_val(rep.codeOfConduct.members))
 		row.append(get_val(rep.codeOfConduct.infoMembers))
 		row.append(get_val(rep.activities))

 		row.append(flatten_list(rep.interests))

 		row.append(get_val(rep.networking))

 		row.extend(get_financials(rep.financialData))

 		# A field can still be None when an element has no single string;
 		# write it as an empty cell instead of failing on None.encode().
 		yield [(s or u"").encode("utf-8") for s in row]

 if __name__ == "__main__":
 	# Script entry point: write the header and every parsed record to
 	# full_export_new.csv, reporting progress on stdout.
 	# print("...") with a single parenthesised string prints the same text
 	# whether print is a statement or a function.
 	print("Reading XML...")
 	# Binary mode because parse() yields already-encoded UTF-8 strings.
 	with open('full_export_new.csv', 'wb') as csvfile:
 		output = csv.writer(csvfile, dialect="excel", quoting=csv.QUOTE_ALL)
 		print("Writing header...")
 		output.writerow(get_header())
 		sys.stdout.write("Writing records...")
 		for row in parse():
 			sys.stdout.write(".")
 			# Flush so each dot shows up as its record is written rather
 			# than whenever the output buffer happens to fill.
 			sys.stdout.flush()
 			output.writerow(row)
 	# Terminate the progress line so the final message gets its own line.
 	sys.stdout.write("\n")
 	print("Transformation complete!")
	from bs4 import BeautifulSoup, UnicodeDammit
	import csv
	import sys

	# Converts the EU open-data XML export (full_export_new.xml) into an Excel-dialect CSV file (full_export_new.csv).
	# This is a best-effort transformer written for one particular export, so it may not handle every record. It is probably not relevant for anyone but me, but if you need it, use it! :)

	# Module metadata.
	# NOTE(review): the e-mail value below looks like a redaction placeholder,
	# presumably inserted by the page this file was copied from -- confirm and
	# restore the real address if it is available.
	__author__ = "Samantha Quinones"
	__email__ = "[email protected]"
	__license__= "Public Domain"

	def get_val(element):
		"""Return the string content of an element, or an empty string.

		Reads ``element.string``. An empty unicode string is returned both
		when ``element`` has no ``string`` attribute (for example when it is
		``None`` because a lookup found nothing) and when the attribute
		itself is ``None``, so callers can slice, format and encode the
		result without checking it first.
		"""
		text = getattr(element, "string", None)
		return u"" if text is None else text

	def get_person(element):
		"""Build a ``(name, position)`` pair describing a person.

		``name`` is the title, first name and last name of ``element``
		formatted with a single space between each part; ``position`` is the
		text of its ``position`` child. Every part is read through get_val().
		"""
		name_parts = (
			get_val(element.title),
			get_val(element.firstName),
			get_val(element.lastName),
		)
		position = get_val(element.position)
		return ("%s %s %s" % name_parts, position)

	def get_contact(element):
		"""Return a tuple ``(address, phone, fax, more)`` of contact info.

		* ``address`` -- "number street, town postCode, country"
		* ``phone``   -- "(+indicPhone) phoneNumber"
		* ``fax``     -- "(+indicFax) fax", read from the nested ``fax`` element
		* ``more``    -- text of ``moreContactDetails``

		Every part is read through get_val(), so a missing child shows up as
		an empty slot in the formatted string.
		"""
		address = "%s %s, %s %s, %s" % (
			get_val(element.number),
			get_val(element.street),
			get_val(element.town),
			get_val(element.postCode),
			get_val(element.country),
		)
		phone = "(+%s) %s" % (get_val(element.indicPhone), get_val(element.phoneNumber))

		# The fax parts live one level deeper. When the wrapping ``fax`` element
		# is absent, look the parts up with a None default so get_val() renders
		# them as blanks (like the phone parts) instead of the lookup raising.
		fax_element = element.fax
		fax = "(+%s) %s" % (
			get_val(getattr(fax_element, "indicFax", None)),
			get_val(getattr(fax_element, "fax", None)),
		)

		more = get_val(element.moreContactDetails)
		return (address, phone, fax, more)

	def flatten_list(element):
		"""Join the text of each item in ``element`` into one ";"-separated string.

		``element`` is any iterable of items exposing a ``string`` attribute,
		or ``None``. Items are skipped when they have no ``string`` attribute,
		when it is ``None``, or when it is empty or whitespace-only, so the
		join never sees a non-string and the result has no blank entries.
		Returns an empty string when nothing is left (or ``element`` is None).
		"""
		if element is None:
			return ""

		texts = []
		for item in element:
			text = getattr(item, "string", None)
			if text is None or not text.strip():
				continue
			texts.append(text)

		return ";".join(texts)

	def get_financials(element):
		"""Return a tuple of financial information.

		The tuple is ``(year, cost_range, procurement, grants, other)``:

		* ``year``       -- "<start>-<end>", the first four characters of the
		  ``startDate`` and ``endDate`` texts
		* ``cost_range`` -- "<min> - <max>" taken from
		  ``financialInformation/turnover/range``; each bound falls back to
		  "0.0" when it cannot be found or is empty
		* the remaining three are the texts of ``eurSourcesProcurement``,
		  ``eurSourcesGrants`` and ``otherFinancialInformation``
		"""
		# get_val() hands back ``.string`` as-is, which may be None; ``or u""``
		# keeps the slice from failing in that case.
		start = (get_val(element.startDate) or u"")[0:4]
		end = (get_val(element.endDate) or u"")[0:4]
		year = "%s-%s" % (start, end)

		# Any missing link in the chain surfaces as an attribute lookup on None.
		try:
			turnover_range = element.financialInformation.turnover.range
		except AttributeError:
			turnover_range = None

		# Apply the same "0.0" default whether the range or just one bound is
		# missing or empty.
		cost_min = get_val(getattr(turnover_range, "min", None)) or "0.0"
		cost_max = get_val(getattr(turnover_range, "max", None)) or "0.0"
		cost_range = "%s - %s" % (cost_min, cost_max)

		return (
			year,
			cost_range,
			get_val(element.eurSourcesProcurement),
			get_val(element.eurSourcesGrants),
			get_val(element.otherFinancialInformation),
		)


	def get_header():
		"""Return the CSV column names, one per field of a row built by parse().

		parse() emits 26 fields per record, so this list has 26 names in the
		same order. The contact block contributes four fields (address,
		phone, fax, other info), and the action-fields and networking values
		have columns of their own; without these names every column after
		"contact details" would sit under the wrong heading.
		"""
		return [
			"name/company name",
			"identificationCode",
			"registration date",
			"acronym",
			"legal status",
			"website address",
			"section",
			"and more precisely",
			"surname, name",
			"position",
			"contact details",
			"phone",
			"fax",
			"other contact info",
			"goals/remit",
			"action fields",
			"number of persons",
			"complementary info",
			"initiatives",
			"fields",
			"networking",
			"financial year",
			"estimated costs",
			"procurement",
			"grants",
			"other financial info",
		]

	def parse():
		"""Parse the XML document and yield one CSV-ready row per record.

		Reads ``full_export_new.xml`` from the current directory and walks
		the child elements of ``ListOfIRPublicDetail/resultList``. Each
		yielded row is a list of 26 UTF-8 encoded strings: name,
		identification code, registration date, acronym, legal status,
		website, main and sub category, the get_person() pair, the four
		get_contact() values, goals, action fields, code-of-conduct members
		and info, activities, interests, networking, and the five
		get_financials() values.

		This is a generator, so the file is opened and parsed only when
		iteration starts.
		"""
		# The tree is built by the BeautifulSoup constructor, so the file can
		# be closed before any record is processed.
		with open("full_export_new.xml") as xml_file:
			soup = BeautifulSoup(xml_file, "xml")

		# Only element children are records; find_all(True, recursive=False)
		# leaves out any text nodes (e.g. whitespace) sitting between them.
		result_list = soup.ListOfIRPublicDetail.resultList
		for rep in result_list.find_all(True, recursive=False):
			row = [
				get_val(rep.find("originalName")),
				get_val(rep.identificationCode),
				# Pass the element itself: get_val() copes with a missing tag,
				# whereas dereferencing .string here would raise on None.
				get_val(rep.registrationDate),
				get_val(rep.acronym),
				get_val(rep.legalStatus),
			]

			try:
				row.append(rep.webSiteURL["ns2:href"])
			except (TypeError, KeyError):
				# No webSiteURL element (None cannot be subscripted) or no
				# ns2:href attribute on it.
				row.append(u'')

			row.append(get_val(rep.category.mainCategory))
			row.append(get_val(rep.category.subCategory))

			row.extend(get_person(rep.legal))
			row.extend(get_contact(rep.contactDetails))

			row.append(get_val(rep.goals))
			row.append(flatten_list(rep.actionFields))

			row.append(get_val(rep.codeOfConduct.members))
			row.append(get_val(rep.codeOfConduct.infoMembers))
			row.append(get_val(rep.activities))

			row.append(flatten_list(rep.interests))

			row.append(get_val(rep.networking))

			row.extend(get_financials(rep.financialData))

			# A field can still be None when an element has no single string;
			# write it as an empty cell instead of failing on None.encode().
			yield [(s or u"").encode("utf-8") for s in row]

	if __name__ == "__main__":
		# Script entry point: write the header and every parsed record to
		# full_export_new.csv, reporting progress on stdout.
		# print("...") with a single parenthesised string prints the same text
		# whether print is a statement or a function.
		print("Reading XML...")
		# Binary mode because parse() yields already-encoded UTF-8 strings.
		with open('full_export_new.csv', 'wb') as csvfile:
			output = csv.writer(csvfile, dialect="excel", quoting=csv.QUOTE_ALL)
			print("Writing header...")
			output.writerow(get_header())
			sys.stdout.write("Writing records...")
			for row in parse():
				sys.stdout.write(".")
				# Flush so each dot shows up as its record is written rather
				# than whenever the output buffer happens to fill.
				sys.stdout.flush()
				output.writerow(row)
		# Terminate the progress line so the final message gets its own line.
		sys.stdout.write("\n")
		print("Transformation complete!")