Last active
August 29, 2015 14:09
-
-
Save nathanntg/5abfbba2c3f43637274b to your computer and use it in GitHub Desktop.
HTML Regular Expressions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
# match all script blocks (DOTALL lets a single block span several lines)
r = re.compile('<script[^>]*?>.*?</script>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove script tags
# NOTE(review): `html` is not defined in this file; it is presumably the HTML
# string supplied by whoever pastes this snippet -- confirm before running.
html_without_scripts = r.sub('', html)

# match all style blocks
r = re.compile('<style[^>]*?>.*?</style>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove style tags
html_without_styles = r.sub('', html)
# remove all comments
def remove_comments(html):
    """
    Remove every ``<!...>`` construct from an HTML string.

    It is somewhat complex and not super elegant, but it handles many of the
    corner cases introduced by weird Internet Explorer conditional comments.

    Two kinds of construct are recognised, both starting with ``<!``:

    * ``<!-`` ... ``->``: the construct ends at the first ``->`` found at or
      after its start (so ``<!-- a > b -->`` is removed as a whole).
    * anything else starting with ``<!`` (``<!DOCTYPE ...>``,
      ``<![if !IE]>``, or a ``<!-`` with no closing ``->``): the construct
      ends at the ``>`` that balances the opening ``<``, counting nested
      ``<``/``>`` pairs.

    If a construct is never terminated, everything from its start to the end
    of the input is dropped.

    :param html: the HTML text to clean.
    :return: ``html`` with all such constructs removed; the input itself is
        returned unchanged when it contains no ``<!``.
    """
    comm_start = html.find('<!')
    if comm_start == -1:
        return html

    # Collect the kept slices and join once at the end instead of growing a
    # string with "+=" on every iteration.
    parts = [html[:comm_start]]
    max_l = len(html)
    comm_end = 0
    while comm_start != -1:
        # add content since last comment ended
        if comm_end > 0:
            parts.append(html[comm_end:comm_start])

        comm_end = -1
        # starts with "<!-": look for the closing "->"
        if comm_start + 2 < max_l and html[comm_start + 2] == '-':
            closer = html.find('->', comm_start)
            if closer != -1:
                comm_end = closer + 2

        # unclear ending, count opening and closing carets
        if comm_end == -1:
            depth = 0
            i = comm_start + 1
            while i < max_l:
                ch = html[i]
                if ch == '>':
                    if depth == 0:
                        # this ">" balances the "<" that opened the construct
                        comm_end = i + 1
                        break
                    depth -= 1
                elif ch == '<':
                    depth += 1
                i += 1
            if comm_end == -1:
                # ran off the end of the input: unterminated, drop the rest
                return ''.join(parts)

        comm_start = html.find('<!', comm_end)

    # trailing content after the last removed construct
    parts.append(html[comm_end:])
    return ''.join(parts)
# can be used to easily remove html comments
html_without_comments = remove_comments(html)
def remove_quotes(s):
    """
    Strip one pair of matching surrounding quotes from a string.

    The string is unquoted only when it is at least two characters long and
    begins and ends with the same quote character (``"`` or ``'``). Anything
    else -- an empty string, an unquoted value, or a value with an unmatched
    leading quote -- is returned unchanged.

    :param s: the (possibly quoted) string.
    :return: ``s`` without its surrounding quote pair, or ``s`` itself.
    """
    if len(s) >= 2 and s[0] in '"\'' and s[-1] == s[0]:
        return s[1:-1]
    return s
def get_tags(html, tag, inner=False):
    """
    A light weight tool for finding all occurrences of the specified tag and
    returning their attributes.

    :param html: the HTML text to search.
    :param tag: the tag name to look for, matched case-insensitively. It is
        inserted into the regular expression verbatim, so regex
        metacharacters in it are interpreted as such.
    :param inner: if true, the tag's contents are also returned, stored under
        the key ``"="``. Contents run up to the closing tag or, failing that,
        up to the next opening of the same tag; they are ``""`` for a
        self-closing tag. Contents may span several lines.
    :return: a list with one dictionary per tag found, mapping lower-cased
        attribute names to their values with surrounding quotes removed.
        Only ``name=value`` attributes are reported.
    """
    # Attribute section: either nothing, or whitespace followed by a run of
    # quoted strings and plain characters. The plain-character alternative
    # matches exactly one character per iteration: nesting a "*" inside the
    # outer "*?" makes the engine try every way of splitting a run, which is
    # exponential on an unterminated tag.
    attrs_pat = '(|\\s((?:"[^"]*"|\'[^\']*\'|[^"\'/>])*?))'
    if inner:
        close_pat = '(?:/>|>(.*?)(?:</' + tag + '\\s*>|(?=<' + tag + ')))'
    else:
        close_pat = '/?>'
    # DOTALL so that "(.*?)" can capture contents containing newlines.
    r_tag = re.compile('<' + tag + attrs_pat + close_pat,
                       re.IGNORECASE | re.DOTALL)
    # Attribute names may contain "-" and ":" (e.g. http-equiv, data-id,
    # xml:lang), so they are matched in full rather than cut at the hyphen.
    r_attr = re.compile(
        '\\b([\\w:-]+)\\s*?=\\s*?("[^"]*?"|\'[^\']*?\'|[^\'">\\s]+)')

    ret = []
    for t in r_tag.findall(html):
        entry = {}
        if inner:
            entry['='] = t[2]
        # t[1] is the raw attribute text ('' when the tag has none)
        for name, value in r_attr.findall(t[1]):
            entry[name.lower()] = remove_quotes(value)
        ret.append(entry)
    return ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment