Created
August 3, 2018 15:27
-
-
Save HebeHH/d4036d866f48c8294ab31be99223e756 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import pandas as pd
import praw
# Capitalised words that are too generic to be useful search terms.
# Kept as a regex alternation; it is wrapped in word boundaries before use.
STOP_WORDS = 'In|The|Man|New|What|My|This|Woman|Best|Why|How|You|Is|Part|To|After|First|No|Boy'

# Minimum score a submission needs to be printed for a search term.
MIN_SCORE = 100

# Number of most frequent capitalised words used as search terms.
TOP_TERMS = 10


def fetch_submissions(subreddit="News", limit=None):
    """Download the newest submissions of *subreddit* as a DataFrame.

    Args:
        subreddit: Name of the subreddit to read.
        limit: Passed straight to PRAW's ``new()`` listing; ``None`` is the
            value the original script used.

    Returns:
        DataFrame with one row per submission and the columns
        ``titles`` and ``scores``.
    """
    # connect to reddit (placeholder credentials -- replace with real ones)
    reddit = praw.Reddit(client_id='my_id', client_secret='my_secret', user_agent='me')
    rows = [[post.title, post.score]
            for post in reddit.subreddit(subreddit).new(limit=limit)]
    return pd.DataFrame(rows, columns=['titles', 'scores'])


def top_proper_nouns(titles, count=TOP_TERMS, stop_words=STOP_WORDS):
    """Return the *count* most frequent capitalised words found in *titles*.

    Args:
        titles: Iterable of title strings.
        count: Maximum number of words to return.
        stop_words: Regex alternation of words to ignore.

    Returns:
        List of words, most frequent first; empty when nothing matches.
    """
    # \b...\b removes stop words only when they are whole words. Without the
    # boundaries "In" would also be cut out of "India", "Is" out of "Israel",
    # "No" out of "North", and those names would never be counted.
    stop_pattern = r'\b(?:' + stop_words + r')\b'
    all_titles = re.sub(stop_pattern, "", " ".join(titles))
    proper_nouns = re.findall(r'[A-Z][a-z]+', all_titles)
    if not proper_nouns:
        # Nothing to count (e.g. the listing came back empty).
        return []
    return list(pd.Series(proper_nouns).value_counts().nlargest(count).index)


def titles_matching(submissions, term, min_score=MIN_SCORE):
    """Select submissions that mention *term* and reached *min_score*.

    Args:
        submissions: DataFrame with ``titles`` and ``scores`` columns.
        term: Text to look for, case-insensitively, anywhere in the title.
        min_score: Inclusive lower bound on the submission score.

    Returns:
        The matching rows of *submissions*.
    """
    # na=False keeps the mask strictly boolean even if a title is missing.
    mentions_term = submissions['titles'].str.contains(
        re.escape(term), case=False, na=False)
    popular = submissions['scores'] >= min_score
    return submissions[mentions_term & popular]


def main():
    """Print, for each top search term, the popular submissions about it."""
    # get new submissions from News
    submissions = fetch_submissions()
    # get most common proper nouns
    search_terms = top_proper_nouns(submissions['titles'])
    # return all submissions referencing one of the search terms
    # with at least MIN_SCORE upvotes
    for term in search_terms:
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("\n\n Titles about " + term + ":")
        print(titles_matching(submissions, term))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment