Created
October 22, 2024 17:42
-
-
Save adosib/156859e23e832e6065a6dd186f564bdf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BEGIN; | |
/* UDF: | |
Purpose: Extract just the text from a field storing HTML | |
*/ | |
CREATE OR REPLACE FUNCTION udf_extract_text_from_html(html_content VARCHAR(MAX)) | |
RETURNS varchar(max) | |
STABLE | |
AS $$ | |
import re | |
from HTMLParser import HTMLParser | |
class HTMLTextExtractor(HTMLParser, object): | |
block_level_html_tags = ( | |
'p', | |
'div', | |
'blockquote', | |
'pre', | |
'li', | |
'ul', | |
'ol', | |
'dl', | |
'dt', | |
'dd', | |
'address', | |
'fieldset', | |
'section', | |
'article', | |
'aside', | |
'header', | |
'footer', | |
'nav', | |
'figure', | |
'figcaption', | |
'main' | |
) | |
def __init__(self): | |
super(HTMLTextExtractor, self).__init__() | |
self.fed = [] | |
self.in_style_or_script = False | |
def handle_starttag(self, tag, attrs): | |
if tag in ('style', 'script'): | |
self.in_style_or_script = True | |
elif tag in HTMLTextExtractor.block_level_html_tags: | |
self.fed.append('\n') | |
def handle_endtag(self, tag): | |
if tag in ('style', 'script'): | |
self.in_style_or_script = False | |
elif tag in HTMLTextExtractor.block_level_html_tags: | |
self.fed.append('\n') | |
def handle_data(self, data): | |
if not self.in_style_or_script: | |
self.fed.append(data) | |
def get_data(self): | |
return ''.join(self.fed) | |
def extract_text(html_content): | |
parser = HTMLTextExtractor() | |
parser.feed(html_content) | |
text = parser.get_data() | |
# Replace consecutive whitespace and \r\n characters that contain at least one \n with a single newline | |
formatted_text = re.sub(r'[\r\n]+|\s*\n\s*', '\n', text).strip() | |
return formatted_text | |
return extract_text(html_content) | |
$$ LANGUAGE plpythonu; | |
COMMIT; | |
END; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment