Last active
June 16, 2023 18:59
-
-
Save rosstex/bc0df9db72833bcf6872f9ba8ec5db06 to your computer and use it in GitHub Desktop.
Deterministic XPath generation for BeautifulSoup elements, for use in web crawling.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html

# Characters that make an attribute value awkward to embed in the
# single-quoted XPath string literal built below; any attribute whose value
# contains one of them is left out of the generated predicate.
BAD_CHARS = {"\"", "'", "[", "]"}


def _attr_predicates(attrs):
    """Return the XPath predicate terms for one element's attribute mapping."""
    terms = []
    for key, value in attrs.items():
        # Multi-valued attributes (e.g. ``class``) arrive as lists; flatten
        # them first so the character checks below look at the real text
        # rather than at whole list items.
        if isinstance(value, list):
            value = " ".join(value)
        if key == "title" or any(char in BAD_CHARS for char in value):
            continue
        if "/" in value:  # URL matching is wonky, so only require presence
            terms.append("normalize-space(@%s)" % key)
        else:
            # NOTE(review): XPath 1.0 string literals do not decode entities,
            # so html.escape() probably prevents matches for values that
            # contain &, < or >. Kept so generated strings stay stable.
            terms.append(
                "normalize-space(@%s)=normalize-space('%s')"
                % (key, html.escape(value))
            )
    return terms


# generates an xpath string for a given BeautifulSoup element
def soup_xpath_gen(element):
    """Build an XPath string locating ``element`` by walking up its parents.

    Each ancestor contributes one step of the form ``name[pred and pred]``,
    where the predicates compare whitespace-normalised attribute values.
    ``title`` attributes and attributes whose value contains a character from
    ``BAD_CHARS`` are ignored; values containing ``/`` are only tested for
    presence. A step with no usable attribute is emitted as the bare tag name
    (never as ``name[]``).

    Args:
        element: an object exposing ``name``, ``attrs`` (a mapping) and
            ``parent``, such as a BeautifulSoup ``Tag``.

    Returns:
        ``/html/...`` when an ``html`` ancestor is reached; otherwise a
        relative path that starts at the outermost ancestor found (for
        fragments or detached elements). An empty string if ``element`` is
        itself the document root.
    """
    steps = []
    rooted = False
    # Stop at the document root ("[document]" is the name BeautifulSoup gives
    # its root object; "document" is kept for backward compatibility) or when
    # the parent chain runs out for a detached element.
    while element is not None and element.name not in ("document", "[document]"):
        if element.name == "html":
            rooted = True
            break  # second to last
        step = str(element.name)
        terms = _attr_predicates(element.attrs)
        if terms:
            step += "[" + " and ".join(terms) + "]"
        steps.append(step)
        element = element.parent

    # Steps were collected innermost-first; the path reads outermost-first.
    path = "/".join(reversed(steps))
    if rooted:
        path = "/html/" + path
    return path.rstrip("/")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for this code.
Question: what should I do when I get this error?
AttributeError: 'WebElement' object has no attribute 'name'