alexstorer · December 31, 2015 04:39
diff --git a/colfromxml.py b/colfromxml.py
 import urllib, os, sys, re, glob, pickle
 import xml.etree.ElementTree as ET
 import csv

 # Use a dictionary to hold each entry: its keys are the dotted XML paths of the entry's leaf elements and its values are the text of those leaves.

 def main(argv):
     """Collect the column names used by the Atom entries of a set of XML files.

     Every file matching ``argv[1] + '*.xml'`` is parsed, each Atom ``entry``
     element is flattened with parseSingle(), and the union of all resulting
     keys is pickled to ``colnames.pkl`` in the current directory.

     Parameters:
         argv: command-line argument list; argv[1] is the path prefix that
             the '*.xml' glob pattern is appended to.

     Returns:
         None on success, 2 when the path prefix argument is missing.
     """
     print(argv)
     if len(argv) < 2:
         print("usage: colfromxml.py <xml-path-prefix>")
         return 2

     allNames = set()
     for xmlname in glob.glob(argv[1] + '*.xml'):
         print(xmlname)
         # Binary mode lets the XML parser honour the file's own encoding
         # declaration; the context manager closes the file on every path.
         with open(xmlname, 'rb') as x:
             try:
                 print("---building parse tree...")
                 tree = ET.parse(x)
                 print("---complete!")
                 for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                     for thiskey in parseSingle(entry):
                         if thiskey not in allNames:
                             print("New Key:")
                             print(thiskey)
                             allNames.add(thiskey)
             except Exception as err:
                 # Best effort: report the bad file, its size and the cause,
                 # then carry on with the remaining files.
                 print("Problem parsing  %s" % xmlname)
                 print("------------------------------------")
                 x.seek(0, 2)
                 print(x.tell())
                 print(repr(err))
                 print("------------------------------------")

     print(allNames)
     fname = 'colnames.pkl'
     print(fname)
     with open(fname, 'wb') as f:
         pickle.dump(allNames, f)

 # parse a single element.
 def parseSingle(elt):
     """Flatten one XML element into a {dotted path: text} dictionary.

     Parameters:
         elt: the element to flatten. getLeaves() rewrites the tags of its
             descendants in place while walking it.

     Returns:
         A new dict with one key per leaf returned by getLeaves(): the
         leaf's tag with every '{namespace}' part removed, mapped to the
         leaf's text. When several leaves share a path the last one wins.
     """
     d = {}
     for leaf in getLeaves(elt, []):
         # Raw string: '\{' is not a valid escape in an ordinary literal.
         newtag = re.sub(r"\{.*?\}", "", leaf.tag)
         d[newtag] = leaf.text
     return d

 # return list of leaves
 def getLeaves(elt,allLeaves):
     """Append every non-empty leaf at or below elt to allLeaves.

     A leaf is an element without children; it is kept only when its text
     is neither None nor pure whitespace. While descending, each child's
     tag is rewritten in place to '<parent tag>.<child tag>', so the tag of
     a returned leaf spells out the path from elt down to it.

     Parameters:
         elt: element to walk (the tags of its descendants are modified).
         allLeaves: list that the leaves are appended to.

     Returns:
         The same allLeaves list object.
     """
     children = list(elt)
     if not children:
         newtag = re.sub(r"\{.*?\}", "", elt.tag)
         if newtag == 'entry.content.award.awardID.awardContractID':
             # Diagnostic dump for this one path when it turns up as a leaf.
             print("\n\n\n--------------------------------------------------------")
             print('Tag:')
             print(newtag)
             print('Text:')
             print(elt.text)
             print('Children:')
             print(children)
             print("--------------------------------------------------------\n\n\n")
         if elt.text is not None and elt.text.strip():
             allLeaves.append(elt)
         return allLeaves

     for child in children:
         # Prefix first so that grandchildren inherit the full path.
         child.tag = elt.tag + "." + child.tag
         getLeaves(child, allLeaves)
     return allLeaves

 # Script entry point: run main() on the command line and use its return
 # value as the process exit status.
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
diff --git a/xml2csv_fromdict.py b/xml2csv_fromdict.py
 import urllib, os, sys, re, glob, pickle
 import xml.etree.ElementTree as ET
 import csv

 # Use a dictionary to hold each entry: its keys are the dotted XML paths of the entry's leaf elements (the CSV columns) and its values are the text of those leaves.

 def main(argv):
     """Convert Atom XML files to CSV using a pickled set of column names.

     The column names are unpickled from ``argv[2]``. Every file matching
     ``argv[1] + '*.xml'`` is parsed and each Atom ``entry`` element becomes
     one row of a CSV file written next to the XML file (same name, '.csv'
     extension). The header is written just before the first row.

     Parameters:
         argv: command-line argument list; argv[1] is the path prefix the
             '*.xml' glob pattern is appended to, argv[2] is the pickle file
             holding the column names.

     Returns:
         None. Problems are reported on stdout rather than raised.
     """
     # retrieve pickled list of key entries
     try:
         fname = argv[2]
         # NOTE(security): pickle.load can execute arbitrary code; only load
         # a column-name file that you produced yourself.
         # colfromxml.py writes this file with 'wb', so read it in binary.
         with open(fname, 'rb') as f:
             print(fname)
             allNames = pickle.load(f)
         print("unpickled!")
         emptyDict = dict.fromkeys(allNames)
         print("We expect to have:  %d columns in our csv." % len(emptyDict))
     except Exception:
         print("Problem loading pickled column names.")
         return

     print(argv)
     # One fixed, sorted column list so every CSV shares the same layout.
     fieldnames = sorted(emptyDict)
     for xmlname in glob.glob(argv[1] + '*.xml'):
         print(xmlname)
         csvname = os.path.splitext(xmlname)[0] + '.csv'
         # Both handles are closed (and the CSV flushed) on every path.
         with open(xmlname, 'rb') as x, open(csvname, 'w') as c:
             dw = None
             try:
                 print("---building parse tree...")
                 tree = ET.parse(x)
                 print("---complete!")
                 for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
                     # Fresh copy per entry: reusing one dict would carry
                     # values of earlier entries into later rows.
                     d = parseSingle(entry, dict(emptyDict))
                     print("Trying to write  %d  columns to csv" % len(d))
                     try:
                         if dw is None:
                             # write the header
                             print("Writing the header...")
                             # Keys missing from the pickled column list are
                             # skipped instead of failing the whole row.
                             dw = csv.DictWriter(c, delimiter=',',
                                                 fieldnames=fieldnames,
                                                 extrasaction='ignore')
                             dw.writeheader()
                         dw.writerow(d)
                     except Exception as err:
                         print("Problem writing csv.")
                         print(repr(err))
             except Exception as err:
                 # Best effort: report the bad file, its size and the cause,
                 # then carry on with the remaining files.
                 print("Problem parsing  %s" % xmlname)
                 print("------------------------------------")
                 x.seek(0, 2)
                 print(x.tell())
                 print(repr(err))
                 print("------------------------------------")

 # parse a single element.
 def parseSingle(elt,d):
     """Flatten one XML element into a row dictionary based on a template.

     Parameters:
         elt: the element to flatten. getLeaves() rewrites the tags of its
             descendants in place while walking it.
         d: template dictionary whose keys are the expected column names.
             It is copied, not modified, so values from one element cannot
             leak into the row built for the next one.

     Returns:
         A new dict: the template's items, updated with one entry per leaf
         returned by getLeaves() (the leaf's tag with every '{namespace}'
         part removed, mapped to the leaf's text). When several leaves
         share a path the last one wins.
     """
     row = dict(d)
     for leaf in getLeaves(elt, []):
         # Raw string: '\{' is not a valid escape in an ordinary literal.
         newtag = re.sub(r"\{.*?\}", "", leaf.tag)
         if newtag not in row:
             # Column absent from the template: print it so it gets noticed.
             print(newtag)
         row[newtag] = leaf.text
     return row

 # return list of leaves
 def getLeaves(elt,allLeaves):
     """Gather the non-empty leaf elements at or below elt.

     Elements without children are leaves; a leaf is added to allLeaves
     only if it has text other than whitespace. Each child's tag is
     replaced in place by '<parent tag>.<child tag>' before the walk
     descends into it. The list passed in is extended and returned.
     """
     kids = list(elt)
     if kids:
         for kid in kids:
             # Prefix first so that deeper levels inherit the full path.
             kid.tag = ".".join((elt.tag, kid.tag))
             found = getLeaves(kid, [])
             allLeaves.extend(found)
         return allLeaves

     text = elt.text
     if text is not None and text.strip():
         allLeaves.append(elt)
     return allLeaves

 # Script entry point: run main() on the command line and use its return
 # value as the process exit status.
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
	import urllib, os, sys, re, glob, pickle
	import xml.etree.ElementTree as ET
	import csv

	# Use a dictionary to hold each entry: its keys are the dotted XML paths of the entry's leaf elements and its values are the text of those leaves.

	def main(argv):
		"""Collect the column names used by the Atom entries of a set of XML files.

		Every file matching ``argv[1] + '*.xml'`` is parsed, each Atom ``entry``
		element is flattened with parseSingle(), and the union of all resulting
		keys is pickled to ``colnames.pkl`` in the current directory.

		Parameters:
			argv: command-line argument list; argv[1] is the path prefix that
				the '*.xml' glob pattern is appended to.

		Returns:
			None on success, 2 when the path prefix argument is missing.
		"""
		print(argv)
		if len(argv) < 2:
			print("usage: colfromxml.py <xml-path-prefix>")
			return 2

		allNames = set()
		for xmlname in glob.glob(argv[1] + '*.xml'):
			print(xmlname)
			# Binary mode lets the XML parser honour the file's own encoding
			# declaration; the context manager closes the file on every path.
			with open(xmlname, 'rb') as x:
				try:
					print("---building parse tree...")
					tree = ET.parse(x)
					print("---complete!")
					for entry in tree.findall(".//{http://www.w3.org/2005/Atom}entry"):
						for thiskey in parseSingle(entry):
							if thiskey not in allNames:
								print("New Key:")
								print(thiskey)
								allNames.add(thiskey)
				except Exception as err:
					# Best effort: report the bad file, its size and the
					# cause, then carry on with the remaining files.
					print("Problem parsing  %s" % xmlname)
					print("------------------------------------")
					x.seek(0, 2)
					print(x.tell())
					print(repr(err))
					print("------------------------------------")

		print(allNames)
		fname = 'colnames.pkl'
		print(fname)
		with open(fname, 'wb') as f:
			pickle.dump(allNames, f)

	# parse a single element.
	def parseSingle(elt):
		"""Flatten one XML element into a {dotted path: text} dictionary.

		Parameters:
			elt: the element to flatten. getLeaves() rewrites the tags of its
				descendants in place while walking it.

		Returns:
			A new dict with one key per leaf returned by getLeaves(): the
			leaf's tag with every '{namespace}' part removed, mapped to the
			leaf's text. When several leaves share a path the last one wins.
		"""
		d = {}
		for leaf in getLeaves(elt, []):
			# Raw string: '\{' is not a valid escape in an ordinary literal.
			newtag = re.sub(r"\{.*?\}", "", leaf.tag)
			d[newtag] = leaf.text
		return d

	# return list of leaves
	def getLeaves(elt,allLeaves):
		"""Append every non-empty leaf at or below elt to allLeaves.

		A leaf is an element without children; it is kept only when its text
		is neither None nor pure whitespace. While descending, each child's
		tag is rewritten in place to '<parent tag>.<child tag>', so the tag of
		a returned leaf spells out the path from elt down to it.

		Parameters:
			elt: element to walk (the tags of its descendants are modified).
			allLeaves: list that the leaves are appended to.

		Returns:
			The same allLeaves list object.
		"""
		children = list(elt)
		if not children:
			newtag = re.sub(r"\{.*?\}", "", elt.tag)
			if newtag == 'entry.content.award.awardID.awardContractID':
				# Diagnostic dump for this one path when it turns up as a leaf.
				print("\n\n\n--------------------------------------------------------")
				print('Tag:')
				print(newtag)
				print('Text:')
				print(elt.text)
				print('Children:')
				print(children)
				print("--------------------------------------------------------\n\n\n")
			if elt.text is not None and elt.text.strip():
				allLeaves.append(elt)
			return allLeaves

		for child in children:
			# Prefix first so that grandchildren inherit the full path.
			child.tag = elt.tag + "." + child.tag
			getLeaves(child, allLeaves)
		return allLeaves

	# Script entry point: run main() on the command line and use its return
	# value as the process exit status.
	if __name__ == '__main__':
		sys.exit(main(sys.argv))