Created
March 28, 2014 05:02
-
-
Save ieatkillerbees/9825724 to your computer and use it in GitHub Desktop.
This parses open-data stuff for the EU. Kinda, most of the time. YMMV. Outputs an excel-ready CSV file, because reasons. This is probably not relevant for anyone but me, but if you need it, use it! :)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup, UnicodeDammit | |
import csv | |
import sys | |
# Parses the EU open-data export (kinda, most of the time; YMMV) and writes an Excel-ready CSV file, because reasons.
# This is probably not relevant to anyone but me, but if you need it, use it! :)
__author__ = "Samantha Quinones" | |
__email__ = "[email protected]" | |
__license__= "Public Domain" | |
def get_val(element):
    """Return the string content of ``element``, or ``u""`` when there is none.

    ``element`` is typically a BeautifulSoup tag, or ``None`` when a lookup
    found nothing. An empty string is returned both when ``element`` has no
    ``string`` attribute and when that attribute is ``None`` (BeautifulSoup
    reports ``None`` for a tag that does not wrap exactly one string), so
    callers can always format, slice or encode the result.
    """
    try:
        text = element.string
    except AttributeError:
        return u""
    # Never hand None back: callers feed the result straight into "%s"
    # formatting, slicing and .encode().
    if text is None:
        return u""
    return text
def get_person(element):
    """Build a ``(name, position)`` pair from a person element.

    The name is the element's title, first name and last name separated by
    single spaces; the position is the text of its ``position`` child.
    """
    title = get_val(element.title)
    first_name = get_val(element.firstName)
    last_name = get_val(element.lastName)
    full_name = "%s %s %s" % (title, first_name, last_name)
    position = get_val(element.position)
    return (full_name, position)
def get_contact(element):
    """Return a ``(address, phone, fax, more)`` tuple of contact info.

    Every field is looked up by tag name on ``element``. Lookups go through
    ``getattr`` with a ``None`` default so that a missing ``element`` or a
    missing nested ``fax`` element produces empty fields instead of raising
    ``AttributeError``.
    """
    def child(node, name):
        # Same lookup as ``node.name``, but tolerates ``node`` being None.
        return getattr(node, name, None)

    address = "%s %s, %s %s, %s" % (
        get_val(child(element, "number")),
        get_val(child(element, "street")),
        get_val(child(element, "town")),
        get_val(child(element, "postCode")),
        get_val(child(element, "country")),
    )
    phone = "(+%s) %s" % (
        get_val(child(element, "indicPhone")),
        get_val(child(element, "phoneNumber")),
    )
    # The fax prefix and number are read from inside the <fax> child, which
    # may be absent altogether.
    fax_element = child(element, "fax")
    fax = "(+%s) %s" % (
        get_val(child(fax_element, "indicFax")),
        get_val(child(fax_element, "fax")),
    )
    more = get_val(child(element, "moreContactDetails"))
    return (address, phone, fax, more)
def flatten_list(element):
    """Join the string content of each child of ``element`` with ``";"``.

    Children that have no ``string`` attribute, or whose ``string`` is
    ``None`` or empty, are skipped. A ``None`` ``element`` (the list is
    absent from the record) yields an empty string.
    """
    if element is None:
        return ""
    actions = []
    for item in element:
        # getattr with a default only swallows the AttributeError raised by
        # children that have no ``string`` attribute, not unrelated errors.
        action = getattr(item, "string", None)
        if action:
            actions.append(action)
    return ";".join(actions)
def get_financials(element):
    """Return a tuple of financial information.

    The tuple is ``(year, cost_range, procurement, grants, other)``:

    * ``year`` is ``"<start>-<end>"`` built from the first four characters
      of the ``startDate`` and ``endDate`` values.
    * ``cost_range`` is ``"<min> - <max>"`` read from
      ``financialInformation/turnover/range``; each bound falls back to
      ``"0.0"`` when any element on that path is missing or has no text.
    * the remaining items are the text of ``eurSourcesProcurement``,
      ``eurSourcesGrants`` and ``otherFinancialInformation``.

    Missing elements (including ``element`` itself being ``None``) produce
    empty values instead of raising.
    """
    def child(node, name):
        # Same lookup as ``node.name``, but tolerates ``node`` being None.
        return getattr(node, name, None)

    def text_of(node, default=u""):
        # get_val may return an empty or None value; normalise to ``default``.
        return get_val(node) or default

    def turnover_bound(name):
        node = element
        for step in ("financialInformation", "turnover", "range", name):
            node = child(node, step)
        return text_of(node, "0.0")

    start = text_of(child(element, "startDate"))[0:4]
    end = text_of(child(element, "endDate"))[0:4]
    year = "%s-%s" % (start, end)
    cost_range = "%s - %s" % (turnover_bound("min"), turnover_bound("max"))
    return (
        year,
        cost_range,
        text_of(child(element, "eurSourcesProcurement")),
        text_of(child(element, "eurSourcesGrants")),
        text_of(child(element, "otherFinancialInformation")),
    )
def get_header():
    """Return the CSV column names, one per value in a row from ``parse()``.

    The order must match the order in which ``parse()`` appends values:
    eight organisation fields, the two ``get_person`` values, the four
    ``get_contact`` values (address, phone, fax, more), goals, action
    fields, the two code-of-conduct values, activities, interests,
    networking, and the five ``get_financials`` values -- 26 columns in all.
    """
    return [
        "name/company name",
        "identificationCode",
        "registration date",
        "acronym",
        "legal status",
        "website address",
        "section",
        "and more precisely",
        "surname, name",
        "position",
        "contact details",
        # get_contact() also returns phone and fax between the address and
        # the free-form details; they need their own columns.
        "phone",
        "fax",
        "other contact info",
        "goals/remit",
        # parse() writes the flattened actionFields list right after goals.
        "action fields",
        "number of persons",
        "complementary info",
        "initiatives",
        "fields",
        # parse() writes the networking text right before the financials.
        "networking",
        "financial year",
        "estimated costs",
        "procurement",
        "grants",
        "other financial info",
    ]
def parse():
    """Parse ``full_export_new.xml`` and yield one CSV row per record.

    Each yielded row is a list of UTF-8 encoded strings, in the column order
    described by ``get_header()``. Values that are missing or ``None`` are
    written as empty strings. Children of the result list that are not tags
    (for example whitespace between elements) are skipped.
    """
    from bs4.element import Tag  # only needed here, to filter out text nodes

    # BeautifulSoup reads the whole document while building the tree, so the
    # file can be closed before the first row is yielded.
    with open("full_export_new.xml") as xml_file:
        soup = BeautifulSoup(xml_file, "xml")

    def child(node, name):
        # Same lookup as ``node.name``, but tolerates ``node`` being None.
        return getattr(node, name, None)

    for rep in soup.ListOfIRPublicDetail.resultList:
        if not isinstance(rep, Tag):
            continue

        try:
            website = rep.webSiteURL["ns2:href"]
        except (TypeError, KeyError):
            # TypeError: no webSiteURL element; KeyError: no href attribute.
            website = u""

        r = [
            get_val(rep.find("originalName")),
            get_val(rep.identificationCode),
            get_val(rep.registrationDate),
            get_val(rep.acronym),
            get_val(rep.legalStatus),
            website,
            get_val(child(rep.category, "mainCategory")),
            get_val(child(rep.category, "subCategory")),
        ]
        r.extend(get_person(rep.legal))
        r.extend(get_contact(rep.contactDetails))
        r.append(get_val(rep.goals))
        r.append(flatten_list(rep.actionFields))
        r.append(get_val(child(rep.codeOfConduct, "members")))
        r.append(get_val(child(rep.codeOfConduct, "infoMembers")))
        r.append(get_val(rep.activities))
        r.append(flatten_list(rep.interests))
        r.append(get_val(rep.networking))
        r.extend(get_financials(rep.financialData))

        # The CSV file is opened in binary mode by the caller, so rows are
        # encoded here; ``or u""`` keeps a stray None from breaking .encode().
        yield [(value or u"").encode("utf-8") for value in r]
if __name__ == "__main__":
    # Script entry point: stream every parsed record into the CSV file,
    # printing one dot per record as a progress indicator.
    print("Reading XML...")
    with open('full_export_new.csv', 'wb') as out_file:
        writer = csv.writer(out_file, dialect="excel", quoting=csv.QUOTE_ALL)
        print("Writing header...")
        writer.writerow(get_header())
        progress = sys.stdout
        progress.write("Writing records...")
        for record in parse():
            progress.write(".")
            writer.writerow(record)
    print("Transformation complete!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment