Last active
December 31, 2015 04:39
-
-
Save alexstorer/7935681 to your computer and use it in GitHub Desktop.
Sloppy undocumented way to maybe get column names from an XML file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib, os, sys, re, glob, pickle | |
import xml.etree.ElementTree as ET | |
import csv | |
# use a dictionary to hold on to each entry. we can define all the dictionary elements and the xml keys that take us there. | |
def main(argv):
    """Collect every leaf column name found in a set of Atom XML files.

    Each file matching ``argv[1] + '*.xml'`` is parsed, every Atom
    ``entry`` element in it is flattened with ``parseSingle``, and the
    keys of the resulting dicts are gathered into one set.  Keys seen for
    the first time are echoed to stdout.  The final set is printed and
    pickled to ``colnames.pkl``.

    Args:
        argv: argument vector; ``argv[1]`` is the glob prefix (typically
            a directory path ending in a separator).

    Returns:
        None.
    """
    print(argv)
    allNames = set()
    # Use the argv that was passed in rather than reaching for sys.argv.
    for xmlname in glob.glob(argv[1] + '*.xml'):
        print(xmlname)
        # Binary mode lets the XML parser honour the document's own
        # encoding declaration; the context manager closes the handle
        # even if something unexpected propagates.
        with open(xmlname, 'rb') as x:
            try:
                print("---building parse tree...")
                tree = ET.parse(x)
                print("---complete!")
                for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                    d = parseSingle(entry)
                    for thiskey in d:
                        if thiskey not in allNames:
                            print("New Key:")
                            print(thiskey)
                    allNames.update(d)
            except Exception:
                # Best effort: report the file and carry on with the next
                # one.  Catching Exception (not a bare except) lets
                # Ctrl-C and SystemExit through.
                # The doubled space reproduces the original output.
                print("Problem parsing  %s" % xmlname)
                print("------------------------------------")
                # Report the file size as a hint (e.g. truncated download).
                x.seek(0, 2)
                print(x.tell())
                print("------------------------------------")
    print(allNames)
    fname = 'colnames.pkl'
    print(fname)
    with open(fname, 'wb') as f:
        pickle.dump(allNames, f)
# parse a single element. | |
def parseSingle(elt):
    """Flatten one element into a dict mapping leaf path to leaf text.

    The leaves come from ``getLeaves``, which rewrites tags into dotted
    paths; any ``{...}`` namespace markers are stripped from those paths
    before they are used as keys.  When two leaves end up with the same
    key, the later one overwrites the earlier one.

    Args:
        elt: ElementTree element to flatten (its descendants' tags are
            modified by ``getLeaves``).

    Returns:
        A new dict of ``{stripped dotted tag: leaf text}``.
    """
    d = dict()
    for leaf in getLeaves(elt, list()):
        # Raw string: "\{" and "\}" are not valid escapes in a plain literal.
        newtag = re.sub(r"\{.*?\}", "", leaf.tag)
        d[newtag] = leaf.text
    return d
# return list of leaves | |
def getLeaves(elt, allLeaves):
    """Collect the text-bearing leaf elements at or below *elt*.

    A leaf is an element without child elements.  Leaves whose text is
    ``None`` or whitespace-only are skipped.  While descending, each
    child's ``tag`` is rewritten IN PLACE to ``<parent tag>.<child tag>``,
    so the tags of the returned elements are dotted paths starting at
    *elt* (namespace markers in braces remain embedded).  Because the
    tree itself is modified, walking the same element twice prefixes the
    tags twice.

    Args:
        elt: ElementTree element to walk.
        allLeaves: list that receives the leaves; extended in place.

    Returns:
        The same ``allLeaves`` list, with leaves appended depth-first in
        child order.
    """
    children = list(elt)
    if not children:
        # Raw string: "\{" and "\}" are not valid escapes in a plain literal.
        newtag = re.sub(r"\{.*?\}", "", elt.tag)
        if newtag == 'entry.content.award.awardID.awardContractID':
            # Diagnostic dump kept from the original for this one path.
            print("\n\n\n--------------------------------------------------------")
            print('Tag:')
            print(newtag)
            print('Text:')
            print(elt.text)
            print('Children:')
            print(children)
            print("--------------------------------------------------------\n\n\n")
        if elt.text is not None and elt.text.strip():
            allLeaves.append(elt)
    else:
        for child in children:
            # Build the dotted path by mutating the child's tag; callers
            # read the path back from the returned elements.
            child.tag = elt.tag + "." + child.tag
            getLeaves(child, allLeaves)
    return allLeaves
if __name__ == '__main__':
    # Script entry point: pass the raw argument vector to main() and use
    # whatever it returns as the process exit status.
    sys.exit(main(sys.argv))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib, os, sys, re, glob, pickle | |
import xml.etree.ElementTree as ET | |
import csv | |
# use a dictionary to hold on to each entry. we can define all the dictionary elements and the xml keys that take us there. | |
def main(argv):
    """Convert Atom XML files to CSV using a pickled set of column names.

    ``argv[2]`` names a pickle file holding the collection of column
    names.  For every file matching ``argv[1] + '*.xml'``, each Atom
    ``entry`` element is flattened with ``parseSingle`` into a row and
    written to a CSV file with the same base name as the XML file.  The
    header is taken from the keys of the first row of each file.

    Args:
        argv: argument vector; ``argv[1]`` is the glob prefix for the XML
            files and ``argv[2]`` the path of the pickled column names.

    Returns:
        None (also when the column names cannot be loaded).
    """
    # retrieve pickled list of key entries
    try:
        fname = argv[2]
        # Pickles are binary data, so open in binary mode.
        with open(fname, 'rb') as f:
            print(fname)
            # NOTE(security): pickle.load can execute arbitrary code;
            # only point this at a file you produced yourself.
            allNames = pickle.load(f)
            print("unpickled!")
        emptyDict = dict.fromkeys(allNames)
        # Doubled space reproduces the original Python 2 print output.
        print("We expect to have:  %d columns in our csv." % len(emptyDict))
    except Exception:
        print("Problem loading pickled column names.")
        return
    print(argv)
    for xmlname in glob.glob(argv[1] + '*.xml'):
        print(xmlname)
        fname = os.path.splitext(xmlname)[0]
        floaded = False
        # Context managers guarantee both files are closed; the original
        # never closed the CSV handle, so buffered rows could be lost.
        with open(xmlname, 'rb') as x, open(fname + '.csv', 'w') as c:
            try:
                print("---building parse tree...")
                tree = ET.parse(x)
                print("---complete!")
                for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                    # parseSingle fills the dict it is given in place, so
                    # hand it a fresh copy of the blank template each
                    # time; otherwise values from earlier entries leak
                    # into later rows.
                    d = parseSingle(entry, dict(emptyDict))
                    print("Trying to write  %d  columns to csv" % len(d))
                    try:
                        if not floaded:
                            # write the header
                            print("Writing the header...")
                            dw = csv.DictWriter(c, delimiter=',', fieldnames=list(d))
                            dw.writerow(dict((fn, fn) for fn in dw.fieldnames))
                            floaded = True
                        dw.writerow(d)
                    except Exception:
                        # Best effort: skip this row and keep going.
                        print("Problem writing csv.")
            except Exception:
                # Best effort: report the file and move on.  Catching
                # Exception (not a bare except) lets Ctrl-C through.
                print("Problem parsing  %s" % xmlname)
                print("------------------------------------")
                # Report the file size as a hint (e.g. truncated download).
                x.seek(0, 2)
                print(x.tell())
                print("------------------------------------")
# parse a single element. | |
def parseSingle(elt, d):
    """Fill *d* with the leaf values found under *elt*.

    The leaves come from ``getLeaves``, which rewrites tags into dotted
    paths; any ``{...}`` namespace markers are stripped from those paths
    before they are used as keys.  A key that is not already present in
    *d* is printed to stdout before being added.  When two leaves map to
    the same key, the later one wins.

    Args:
        elt: ElementTree element to flatten (its descendants' tags are
            modified by ``getLeaves``).
        d: dict to populate; it is modified in place, so pass a copy if
            the original must stay untouched.

    Returns:
        The same dict ``d``.
    """
    for leaf in getLeaves(elt, list()):
        # Raw string: "\{" and "\}" are not valid escapes in a plain literal.
        newtag = re.sub(r"\{.*?\}", "", leaf.tag)
        if newtag not in d:
            # Column that was not among the keys supplied in d.
            print(newtag)
        d[newtag] = leaf.text
    return d
# return list of leaves | |
def getLeaves(elt, allLeaves):
    """Collect the text-bearing leaf elements at or below *elt*.

    A leaf is an element without child elements.  Leaves whose text is
    ``None`` or whitespace-only are skipped.  While descending, each
    child's ``tag`` is rewritten IN PLACE to ``<parent tag>.<child tag>``,
    so the tags of the returned elements are dotted paths starting at
    *elt* (namespace markers in braces remain embedded).  Because the
    tree itself is modified, walking the same element twice prefixes the
    tags twice.

    Args:
        elt: ElementTree element to walk.
        allLeaves: list that receives the leaves; extended in place.

    Returns:
        The same ``allLeaves`` list, with leaves appended depth-first in
        child order.
    """
    children = list(elt)
    if not children:
        if elt.text is not None and elt.text.strip():
            allLeaves.append(elt)
    else:
        for child in children:
            # Build the dotted path by mutating the child's tag; callers
            # read the path back from the returned elements.
            child.tag = elt.tag + "." + child.tag
            getLeaves(child, allLeaves)
    return allLeaves
if __name__ == '__main__':
    # Script entry point: pass the raw argument vector to main() and use
    # whatever it returns as the process exit status.
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment