Skip to content

Instantly share code, notes, and snippets.

@adosib
Created October 22, 2024 17:42
Show Gist options
  • Save adosib/156859e23e832e6065a6dd186f564bdf to your computer and use it in GitHub Desktop.
Save adosib/156859e23e832e6065a6dd186f564bdf to your computer and use it in GitHub Desktop.
BEGIN;
/* UDF:
Purpose: Extract just the text from a field storing HTML
*/
CREATE OR REPLACE FUNCTION udf_extract_text_from_html(html_content VARCHAR(MAX))
RETURNS varchar(max)
STABLE
AS $$
import re
from HTMLParser import HTMLParser
class HTMLTextExtractor(HTMLParser, object):
block_level_html_tags = (
'p',
'div',
'blockquote',
'pre',
'li',
'ul',
'ol',
'dl',
'dt',
'dd',
'address',
'fieldset',
'section',
'article',
'aside',
'header',
'footer',
'nav',
'figure',
'figcaption',
'main'
)
def __init__(self):
super(HTMLTextExtractor, self).__init__()
self.fed = []
self.in_style_or_script = False
def handle_starttag(self, tag, attrs):
if tag in ('style', 'script'):
self.in_style_or_script = True
elif tag in HTMLTextExtractor.block_level_html_tags:
self.fed.append('\n')
def handle_endtag(self, tag):
if tag in ('style', 'script'):
self.in_style_or_script = False
elif tag in HTMLTextExtractor.block_level_html_tags:
self.fed.append('\n')
def handle_data(self, data):
if not self.in_style_or_script:
self.fed.append(data)
def get_data(self):
return ''.join(self.fed)
def extract_text(html_content):
parser = HTMLTextExtractor()
parser.feed(html_content)
text = parser.get_data()
# Replace consecutive whitespace and \r\n characters that contain at least one \n with a single newline
formatted_text = re.sub(r'[\r\n]+|\s*\n\s*', '\n', text).strip()
return formatted_text
return extract_text(html_content)
$$ LANGUAGE plpythonu;
COMMIT;
END;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment