MIT xPRO: DSx Data Science and Big Data Analytics: Making Data-Driven Decisions - News Downloader
import os
import requests
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from bs4 import BeautifulSoup

# Configuration
newsapi = NewsApiClient(api_key='****')
output_path = './corpus/'
# Setting this to True will scrape each article URL for topics and full content.
# Scraping is specific to the AP news source.
scrape_topics_and_extended_content = True

# Create the output folder if it doesn't exist already
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Get the most recent 100 articles from AP since yesterday
yesterday = datetime.now() - timedelta(1)
articles = newsapi.get_everything(sources='associated-press',
                                  from_param=datetime.strftime(yesterday, '%Y-%m-%d'),
                                  sort_by='publishedAt',
                                  page_size=100)['articles']

# Iterate over articles and write corpus files.
for idx, article in enumerate(articles):
    # Create title file
    fh = open('{}title-{}.txt'.format(output_path, idx), 'w')
    fh.write(article['title'])
    fh.close()

    # Create topic file
    fh = open('{}topic-{}.txt'.format(output_path, idx), 'w')
    # Either scrape topics and full content from the article page,
    # or fall back to the NewsAPI summary with manually provided topics.
    if scrape_topics_and_extended_content:
        print('Scraping: {}'.format(article['url']))
        # Fetch the article URL body
        raw_body = requests.get(article['url']).text
        # Parse the article page
        body = BeautifulSoup(raw_body, 'html.parser')
        related_topics = body.findAll('a', {'data-key': 'related-tag'})
        topics = []
        for topic in related_topics:
            topics.append(topic.text)
        # Write topics to file as a comma-separated list
        fh.write(','.join(topics))
        fh.close()

        # Parse the article content and join its paragraphs into one string.
        plain_content = ''
        article_content_paragraphs = body.find('div', {'data-key': 'article'}).findAll('p')
        for child in article_content_paragraphs:
            # Ignore elements containing links to keep the data clean.
            if 'http' not in child.text:
                plain_content = ''.join((plain_content, ' ', child.text))

        # Create article file from the scraped data
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(plain_content)
        fh.close()
    else:
        # Create an empty file to be filled in manually with a comma-separated list of topics
        fh.write('')
        fh.close()

        # Create article file from the article summary
        fh = open('{}article-{}.txt'.format(output_path, idx), 'w')
        fh.write(article['content'])
        fh.close()
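For reference, a minimal sketch of how the corpus written above could be loaded back for analysis. The ./corpus/ path and the title-/topic-/article- file naming come from the script; the loader itself is illustrative, not part of the original gist.

import glob
import os

# Load each article back from the corpus files the downloader wrote.
corpus = []
for title_path in sorted(glob.glob('./corpus/title-*.txt')):
    # Recover the numeric index from a name like './corpus/title-3.txt'
    idx = os.path.basename(title_path)[len('title-'):-len('.txt')]
    with open(title_path) as fh:
        title = fh.read()
    with open('./corpus/topic-{}.txt'.format(idx)) as fh:
        topics = [t for t in fh.read().split(',') if t]
    with open('./corpus/article-{}.txt'.format(idx)) as fh:
        content = fh.read()
    corpus.append({'title': title, 'topics': topics, 'content': content})

print('Loaded {} documents'.format(len(corpus)))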
requests==2.24.0
newsapi-python==0.2.6
beautifulsoup4==4.9.3
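Assuming the pinned dependencies above are saved to a requirements.txt file, they can be installed with pip install -r requirements.txt before running the downloader.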