Last active
September 7, 2017 16:30
-
-
Save rosemichaele/af1b749a84ba17f157240a8ee568954b to your computer and use it in GitHub Desktop.
Parse an input XML string into a list of tag(attributes) entries, prefixing each entry with dashes according to its number of parent tags.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xml.parsers.expat import ParserCreate, ExpatError, errors | |
from functools import partial | |
def start_element(name, attrs, out, depth):
    """
    Start-tag handler: record where a tag first appears and which attributes it has.

    :param: name -> str: name of the element being opened
    :param: attrs -> iterable: attribute names of this occurrence (iterating the
        name-to-value mapping supplied by expat yields the names)
    :param: out -> dict: per-tag records, mutated in place. Each record holds
        "attributes" (set of names seen on any occurrence), "loc" (index of the
        tag's first appearance) and "depth" (nesting level of that appearance)
    :param: depth -> list(int): log of nesting steps, mutated in place; its sum
        is the current nesting level
    """
    # Entering an element goes one level deeper.
    depth.append(1)
    record = out.get(name)
    if record is None:
        # First sighting of this tag. len(out) is read before the insertion,
        # so it is the zero-based position of the new record.
        record = out[name] = {
            "attributes": set(),
            "loc": len(out),
            "depth": sum(depth),
        }
    # Later occurrences only contribute attribute names that are new.
    record["attributes"].update(attrs)
def end_element(name, depth):
    """
    End-tag handler: note that the parser has left one nesting level.

    :param: name -> str: name of the element being closed (not used)
    :param: depth -> list(int): log of nesting steps, mutated in place
    """
    # A -1 step cancels the +1 logged when the element was opened.
    depth.append(-1)
def xmlTags(xml):
    """
    Parse input XML into an outline of its tags.

    Every distinct tag name produces one entry of the form
    "<dashes>name(attr1, attr2)": two dashes per parent level, followed by the
    tag name and the sorted attribute names collected from all occurrences of
    that tag. Entries are ordered by each tag's first appearance.

    :param: xml -> str: the XML to parse
    :return: output -> list(str): the result of the parsing, or the two-item
        list ["Error:", <expat message>] when the input is rejected by the
        parser; see examples.txt
    Assumptions:
    -There is a single tag at the root level;
    -Each tag has a single parent tag (i.e. if there are several occurrences of tag a, and in one occurrence it's a child of tag b and in the other one it's a child of tag c, then b = c);
    -Each appearance of the same tag belongs to the same level.
    """
    res = {}
    d = [0]
    p = ParserCreate()
    p.StartElementHandler = partial(start_element, out=res, depth=d)
    p.EndElementHandler = partial(end_element, depth=d)
    try:
        # The whole document is passed in one call, so mark it as final;
        # otherwise truncated input is never reported as an error.
        p.Parse(xml, True)
    except ExpatError as err:
        # Return a list, as documented, rather than a tuple.
        return ["Error:", errors.messages[err.code]]
    # Each record's "loc" is its slot in the output, so fill by index.
    output = [None] * len(res)
    for tag, info in res.items():
        a_str = ", ".join(sorted(info["attributes"]))
        depth_prefix = "--" * (info["depth"] - 1)
        output[info["loc"]] = depth_prefix + "{n}({a})".format(n=tag, a=a_str)
    return output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Ex. 1: | |
xml: "<outer attr1=\"lol hello\" attr2=\"pal how are you\">hello</outer>" | |
Expected Output: ["outer(attr1, attr2)"] | |
Ex. 2: | |
xml: "<data> | |
<animal name=\"cat\"> | |
<genus>Felis</genus> | |
<family name=\"Felidae\" subfamily=\"Felinae\"/> | |
<similar name=\"tiger\" size=\"bigger\"/> | |
</animal> | |
<animal name=\"dog\"> | |
<family name=\"Canidae\" member=\"canid\"/><order>Carnivora</order> | |
<similar name=\"fox\" size=\"similar\"/> | |
</animal> | |
</data>" | |
Expected Output: ["data()", | |
"--animal(name)", | |
"----genus()", | |
"----family(member, name, subfamily)", | |
"----similar(name, size)", | |
"----order()"] | |
Ex. 3: | |
xml: "<here urlid=\"blah-blah\"> | |
<component type=\"Documents\" context=\"User\"> | |
<displayName>My Video</displayName> | |
<role role=\"Data\"> | |
<detects> | |
<detect> | |
<condition>Helper.hasObject</condition> | |
</detect> | |
</detects> | |
<rules> | |
<include filter=\"Helper.IgnoreIrrelevantLinks\"> | |
<objectSet> | |
<pattern type=\"File\"></pattern> | |
</objectSet> | |
</include> | |
</rules> | |
</role> | |
</component> | |
</here>" | |
Expected Output: ["here(urlid)", | |
"--component(context, type)", | |
"----displayName()", | |
"----role(role)", | |
"------detects()", | |
"--------detect()", | |
"----------condition()", | |
"------rules()", | |
"--------include(filter)", | |
"----------objectSet()", | |
"------------pattern(type)"] | |
Ex. 4: | |
xml: "{'this':'is not', 'xml': 1}" | |
Expected Output: ["Error:", | |
"not well-formed (invalid token)"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment