Given a URL to a website, extracts the article text from the page and formats it as markdown.
""" | |
define fn for extracting articles | |
Example usage: | |
url = 'https://www.businessinsider.com/hundreds-google-workers-walk-out-zurich-protest-over-layoffs-2023-3' | |
markdown = extract_article(url) | |
print(markdown) | |
""" | |
import requests | |
import re | |
from bs4 import BeautifulSoup | |
def extract_article(url, strip_links=True): | |
""" | |
Given a URL to a website, extracts all the text article from the URL formatted as markdown. | |
:param url: str, URL of the website to extract the article from | |
:param strip_links: bool, decides whether or not to strip links (by default true) | |
:return: str, article content in markdown format | |
""" | |
# Make a request to the URL | |
response = requests.get(url) | |
if response.status_code != 200: | |
raise Exception( | |
f"Request to {url} failed with status code {response.status_code}" | |
) | |
try: | |
# Parse the HTML content using BeautifulSoup | |
soup = BeautifulSoup(response.content, "html.parser") | |
# Find the article content | |
article = soup.find("article") | |
# Remove unwanted elements from the article | |
for element in article.find_all(["script", "style"]): | |
element.extract() | |
# Convert the article to markdown format | |
markdown = "" | |
# Add the article title | |
title_elem = article.find("h1") | |
if title_elem: | |
title = title_elem.get_text() | |
markdown += f"\n# {title}\n\n" | |
# Add the article image | |
image = article.find("img") | |
if image: | |
alt = image.get("alt") | |
src = image.get("src") | |
markdown += f"\n\n" | |
# Add the article content | |
for paragraph in article.find_all("p"): | |
text = paragraph.get_text() | |
if strip_links: | |
try: | |
text = re.sub(r"\[.*?\]\(.*?\)", "", text) # Strip links | |
except re.error as e: | |
print(f"Error stripping links from article text: {e}") | |
markdown += f"{text}\n\n" | |
# Log the number of words | |
try: | |
word_count = len(re.findall(r"\b\w+\b", markdown)) | |
print(f"The article contains {word_count} words.") | |
except TypeError: | |
print("The article content is empty.") | |
return markdown | |
except Exception as e: | |
print(f"Error extracting article content: {e}") | |
return "" |
Additional cleaning, ezmode:
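A minimal sketch of one way to do this, assuming the third-party clean-text package (pip install clean-text) applied to the markdown string returned by extract_article; the flags below only normalize unicode and preserve casing and paragraph breaks:

from cleantext import clean

# normalize unicode artifacts without lowercasing or joining lines
cleaned = clean(
    markdown,
    fix_unicode=True,      # repair mojibake and odd unicode
    to_ascii=False,        # keep legitimate non-ASCII characters
    lower=False,           # preserve the original casing
    no_line_breaks=False,  # keep the markdown paragraph breaks
)
print(cleaned)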