MIT xPRO: DSx Data Science and Big Data Analytics: Making Data-Driven Decisions - News Downloader
import os
import requests
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from bs4 import BeautifulSoup

# Configuration
newsapi = NewsApiClient(api_key='****')
output_path = './corpus/'
# Setting this to True will scrape each article URL for topics and full content.
# Scraping is specific to the AP news source.
scrape_topics_and_extended_content = True

# Create the output folder if it doesn't exist already
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Get the most recent 100 articles from AP since yesterday
yesterday = datetime.now() - timedelta(1)
articles = newsapi.get_everything(sources='associated-press',
                                  from_param=datetime.strftime(yesterday, '%Y-%m-%d'),
                                  sort_by='publishedAt',
                                  page_size=100)['articles']

# Iterate over articles and write corpus files.
for idx, article in enumerate(articles):
    # Create title file
    fh = open('{}title-{}.txt'.format(output_path, idx), 'w')
    fh.write(article['title'])
    fh.close()

    # Create topic file
    fh = open('{}topic-{}.txt'.format(output_path, idx), 'w')
    # Either scrape topics and full content from the article page,
    # or fall back to the NewsAPI summary with manually provided topics.
    if scrape_topics_and_extended_content:
        print('Scraping: {}'.format(article['url']))
        # Fetch the article URL body
        raw_body = requests.get(article['url']).text
        # Parse the article page
        body = BeautifulSoup(raw_body, 'html.parser')
        related_topics = body.findAll('a', {'data-key': 'related-tag'})
        topics = []
        for topic in related_topics:
            topics.append(topic.text)
        # Write topics to file as a comma-separated list
        fh.write(','.join(topics))
        fh.close()

        # Parse the article content and join its paragraphs into one string.
        plain_content = ''
        article_content_paragraphs = body.find('div', {'data-key': 'article'}).findAll('p')
        for child in article_content_paragraphs:
            # Ignore elements containing links to keep the data clean.
            if 'http' not in child.text:
                plain_content = ''.join((plain_content, ' ', child.text))

        # Create article file from the scraped data
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(plain_content)
        fh.close()
    else:
        # Create an empty file to be filled in manually with a comma-separated list of topics
        fh.write('')
        fh.close()

        # Create article file from the article summary
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(article['content'])
        fh.close()
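For reference, a minimal sketch of how the corpus written above could be loaded back for analysis. The ./corpus/ path and the title-/topic-/article- file naming come from the script; the loader itself is illustrative, not part of the original gist.

import glob
import os

# Load each article back from the corpus files the downloader wrote.
corpus = []
for title_path in sorted(glob.glob('./corpus/title-*.txt')):
    # Recover the numeric index from a name like './corpus/title-3.txt'
    idx = os.path.basename(title_path)[len('title-'):-len('.txt')]
    with open(title_path) as fh:
        title = fh.read()
    with open('./corpus/topic-{}.txt'.format(idx)) as fh:
        topics = [t for t in fh.read().split(',') if t]
    with open('./corpus/article-{}.txt'.format(idx)) as fh:
        content = fh.read()
    corpus.append({'title': title, 'topics': topics, 'content': content})

print('Loaded {} documents'.format(len(corpus)))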
requests==2.24.0
newsapi-python==0.2.6
beautifulsoup4==4.9.3
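Assuming the pinned dependencies above are saved to a requirements.txt file, they can be installed with pip install -r requirements.txt before running the downloader.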