Skip to content

Instantly share code, notes, and snippets.

@ieatkillerbees
Created March 28, 2014 05:02
Show Gist options
  • Save ieatkillerbees/9825724 to your computer and use it in GitHub Desktop.
Save ieatkillerbees/9825724 to your computer and use it in GitHub Desktop.
This parses open-data stuff for the EU. Kinda, most of the time. YMMV. Outputs an excel-ready CSV file, because reasons. This is probably not relevant for anyone but me, but if you need it, use it! :)
from bs4 import BeautifulSoup, UnicodeDammit
import csv
import sys
# This parses open-data stuff for the EU. Kinda, most of the time. YMMV. Outputs an excel-ready CSV file, because reasons.
# This is probably not relevant for anyone but me, but if you need it, use it! :)
__author__ = "Samantha Quinones"
__email__ = "[email protected]"
__license__= "Public Domain"
def get_val(element):
    """Return the string content of an element, or an empty string.

    ``element`` may be any object exposing a ``.string`` attribute. An
    empty unicode string is returned when ``element`` has no ``.string``
    attribute at all (for example when it is ``None``) and also when
    ``.string`` is itself ``None``, so callers can always format, slice
    and encode the result.
    """
    try:
        string = element.string
    except AttributeError:
        return u""
    # An element that exists may still report ``None`` as its string;
    # normalise that to the same empty value used for a missing element.
    if string is None:
        return u""
    return string
def get_person(element):
    """Build a ``(name, position)`` pair describing a person.

    The name is the element's ``title``, ``firstName`` and ``lastName``
    values, each read through ``get_val`` and separated by single spaces.
    The position is the ``get_val`` of the element's ``position``.
    """
    name_parts = (
        get_val(element.title),
        get_val(element.firstName),
        get_val(element.lastName),
    )
    full_name = "%s %s %s" % name_parts
    role = get_val(element.position)
    return full_name, role
def get_contact(element):
    """Return a tuple of contact info: ``(address, phone, fax, more)``.

    * ``address`` is formatted as ``"number street, town postCode, country"``.
    * ``phone`` is formatted as ``"(+indicPhone) phoneNumber"``.
    * ``fax`` is formatted as ``"(+indicFax) fax"``, with both parts read
      from beneath the element's ``fax`` child.
    * ``more`` is the element's ``moreContactDetails`` value.

    Every value is read through ``get_val``, so a missing field
    contributes an empty string to its slot.
    """
    address = "%s %s, %s %s, %s" % (
        get_val(element.number),
        get_val(element.street),
        get_val(element.town),
        get_val(element.postCode),
        get_val(element.country),
    )
    phone = "(+%s) %s" % (
        get_val(element.indicPhone),
        get_val(element.phoneNumber),
    )
    # The fax parts live one level down. When there is no ``fax`` child the
    # lookup yields None, and dereferencing it directly would raise
    # AttributeError; getattr's default makes a missing fax behave like any
    # other missing field instead.
    fax_node = element.fax
    fax = "(+%s) %s" % (
        get_val(getattr(fax_node, "indicFax", None)),
        get_val(getattr(fax_node, "fax", None)),
    )
    more = get_val(element.moreContactDetails)
    return (address, phone, fax, more)
def flatten_list(element):
    """Join the string content of each item in ``element`` with ``";"``.

    Items that have no ``.string`` attribute, or whose ``.string`` is
    ``None`` or empty, are skipped. A ``None`` element (nothing to iterate)
    yields an empty string.
    """
    if element is None:
        return ""
    actions = []
    for item in element:
        # getattr's default covers items lacking ``.string`` without a
        # catch-all ``except`` that would also hide unrelated errors.
        action = getattr(item, "string", None)
        # Skipping None as well as "" keeps str.join from raising TypeError.
        if action is not None and action != "":
            actions.append(action)
    return ";".join(actions)
def get_financials(element):
    """Return a tuple of financial information.

    The tuple is ``(year, cost_range, procurement, grants, other)``:

    * ``year`` is the first four characters of ``startDate`` and of
      ``endDate``, joined with ``"-"``.
    * ``cost_range`` is ``"min - max"`` read from
      ``financialInformation.turnover.range``; a bound becomes ``"0.0"``
      when that path cannot be traversed.
    * the remaining three are the ``eurSourcesProcurement``,
      ``eurSourcesGrants`` and ``otherFinancialInformation`` values.
    """
    # ``or u""`` keeps the slice safe should get_val hand back None.
    start = (get_val(element.startDate) or u"")[0:4]
    end = (get_val(element.endDate) or u"")[0:4]
    year = "%s-%s" % (start, end)
    # Only a missing link in the attribute chain (None has no such
    # attribute) is expected here, so catch AttributeError specifically
    # instead of swallowing every exception.
    try:
        cost_min = get_val(element.financialInformation.turnover.range.min)
    except AttributeError:
        cost_min = "0.0"
    try:
        cost_max = get_val(element.financialInformation.turnover.range.max)
    except AttributeError:
        cost_max = "0.0"
    cost_range = "%s - %s" % (cost_min, cost_max)
    return (
        year,
        cost_range,
        get_val(element.eurSourcesProcurement),
        get_val(element.eurSourcesGrants),
        get_val(element.otherFinancialInformation),
    )
def get_header():
    """Return a new list holding the column titles for the CSV header row."""
    # NOTE(review): there are 22 titles here, while parse() in this file
    # appends 26 values per record -- confirm the intended column mapping.
    titles = (
        "name/company name",
        "identificationCode",
        "registration date",
        "acronym",
        "legal status",
        "website address",
        "section",
        "and more precisely",
        "surname, name",
        "position",
        "contact details",
        "other contact info",
        "goals/remit",
        "number of persons",
        "complementary info",
        "initiatives",
        "fields",
        "financial year",
        "estimated costs",
        "procurement",
        "grants",
        "other financial info",
    )
    return list(titles)
def parse():
    """Parse the XML export and yield one CSV-ready row per record.

    Reads ``full_export_new.xml`` from the current directory, walks the
    direct child tags of ``ListOfIRPublicDetail.resultList`` and, for each
    one, yields a list of UTF-8 encoded values in the order they are
    appended below. Fields that are missing contribute an empty value.
    """
    # The document is parsed inside the ``with`` so the file handle is
    # released as soon as the soup has been built, not left open.
    with open("full_export_new.xml") as xml_file:
        soup = BeautifulSoup(xml_file, "xml")

    # find_all(True, recursive=False) yields only the direct child *tags*;
    # iterating the tag itself would also visit text nodes between records,
    # which do not support the attribute lookups used below.
    records = soup.ListOfIRPublicDetail.resultList.find_all(True, recursive=False)
    for rep in records:
        r = []
        r.append(get_val(rep.find("originalName")))
        r.append(get_val(rep.identificationCode))
        # Hand get_val the element itself so a missing registrationDate is
        # tolerated like every other field.
        r.append(get_val(rep.registrationDate))
        r.append(get_val(rep.acronym))
        r.append(get_val(rep.legalStatus))
        try:
            r.append(rep.webSiteURL["ns2:href"])
        except (TypeError, KeyError):
            # TypeError: no webSiteURL element (None is not subscriptable);
            # KeyError: the element lacks the href attribute.
            r.append(u'')
        # Nested lookups go through getattr so an absent parent gives None
        # (and therefore an empty value) instead of an AttributeError.
        category = rep.category
        r.append(get_val(getattr(category, "mainCategory", None)))
        r.append(get_val(getattr(category, "subCategory", None)))
        r.extend(get_person(rep.legal))
        r.extend(get_contact(rep.contactDetails))
        r.append(get_val(rep.goals))
        r.append(flatten_list(rep.actionFields))
        conduct = rep.codeOfConduct
        r.append(get_val(getattr(conduct, "members", None)))
        r.append(get_val(getattr(conduct, "infoMembers", None)))
        r.append(get_val(rep.activities))
        r.append(flatten_list(rep.interests))
        r.append(get_val(rep.networking))
        r.extend(get_financials(rep.financialData))
        # ``or u""`` turns a stray None into an empty cell so one
        # unexpected field cannot abort the whole export.
        yield [(s or u"").encode("utf-8") for s in r]
if __name__ == "__main__":
    # Script entry point: stream every parsed record into an Excel-dialect
    # CSV file, reporting progress on stdout (one dot per record written).
    sys.stdout.write("Reading XML...\n")
    with open('full_export_new.csv', 'wb') as out_file:
        writer = csv.writer(out_file, dialect="excel", quoting=csv.QUOTE_ALL)
        sys.stdout.write("Writing header...\n")
        writer.writerow(get_header())
        sys.stdout.write("Writing records...")
        for record in parse():
            sys.stdout.write(".")
            writer.writerow(record)
    sys.stdout.write("Transformation complete!\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment