Last active
August 3, 2018 15:27
-
-
Save HebeHH/0ddf4fac3c1466b784cc51b4e6162bfa to your computer and use it in GitHub Desktop.
Search through specific subreddits from the command line. Returns titles and vote scores only
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import praw | |
import re | |
from tqdm import tqdm | |
import pandas as pd | |
# Cap how many characters of each title pandas shows when a frame is printed.
pd.options.display.max_colwidth = 70

# Pick subreddits. Any delimiter other than a letter, digit, underscore or
# newline is okay. Underscores are kept as part of a token because subreddit
# names may contain them; treating "_" as a delimiter would split one name
# into two wrong ones. (The fetch step further down falls back to r/News if
# nothing valid was entered.)
subreddits = "+".join(
    re.findall(r'[A-Za-z0-9_]+', raw_input("Enter desired subreddits on one line\n")))
# Single-argument print(...) behaves the same under the Python 2 print statement.
print("Your subreddits are: " + subreddits)

# connect to reddit - needs your own credentials
reddit = praw.Reddit(client_id='ur_cliend_id',
                     client_secret='ur_client_secret',
                     user_agent='Hi it me')

# Sort order used when listing submissions; interpreted by get_submissions.
search_type = raw_input("Pick your search type. Options: 'hot' (default), 'new', 'top'\n")
def get_submissions(search_type, subreddits, limit=None, client=None):
    """Return a listing of submissions from *subreddits* in the chosen order.

    Args:
        search_type: Sort order as typed by the user. 'new' and 'top' are
            recognised, ignoring case and surrounding whitespace; any other
            value selects 'hot'.
        subreddits: Subreddit name(s), passed unchanged to
            ``client.subreddit``.
        limit: Forwarded to the listing call. The default, None, is what the
            script has always passed.
        client: Object exposing ``subreddit(name)``. Defaults to the
            module-level ``reddit`` instance.

    Returns:
        Whatever the selected listing method (``new``, ``top`` or ``hot``)
        returns.
    """
    if client is None:
        client = reddit
    # Normalise once so that input such as "New " is not silently treated as hot.
    sort_order = search_type.strip().lower()
    if sort_order not in ('new', 'top'):
        sort_order = 'hot'
    return getattr(client.subreddit(subreddits), sort_order)(limit=limit)
print("This may take a bit")


def _collect_rows(listing):
    """Return a [title, score] row for every submission in *listing*."""
    # if you want more info than title and score, edit this bit
    # tqdm cannot show a total because submissions are lazily evaluated, but
    # the running count lets the user know something is happening.
    return [[submission.title, submission.score] for submission in tqdm(listing)]


try:
    submissions = _collect_rows(get_submissions(search_type, subreddits))
except Exception as error:
    # `Exception` rather than a bare except, so Ctrl-C during the long fetch
    # aborts the script instead of silently restarting with News. Because the
    # rows are built in one expression, a failure part-way through leaves no
    # partial results to be mixed with the fallback rows.
    print("Fetch failed: %s" % error)
    print("No valid subreddit was entered, defaulting to News.")
    submissions = _collect_rows(get_submissions(search_type, "News"))
print("Fetched %d submissions" % len(submissions))
submissions = pd.DataFrame(submissions, columns=['titles', 'scores'])
# get search terms. Can be user input or the most common proper nouns (excluding minor ones)
search_term_type = raw_input("Pick your search term type. Options: 'choose own' (default), 'common proper nouns' \n")
if search_term_type.strip().lower() == 'common proper nouns':
    # Capitalised filler words that should not count as proper nouns. should expand.
    # The \b anchors matter: without them the substitution also eats the start
    # of longer words ("India" -> "dia", "Israel" -> "rael"), which then fail
    # the capitalised-word match below and vanish from the ranking.
    stop_words = r'\b(?:In|The|Man|New|What|My|This|Woman|Best|Why|How|You|Is|Part)\b'
    all_titles = re.sub(stop_words, "", " ".join(submissions.titles))
    # find all proper nouns: a capital letter followed by lowercase letters
    proper_nouns = re.findall(r'[A-Z][a-z]+', all_titles)
    # get most common; .tolist() so both branches produce a plain list
    search_terms = pd.Series(proper_nouns).value_counts().nlargest(10).index.tolist()
else:
    # user input search terms: either a double-quoted phrase that starts with a
    # letter, or a bare run of letters
    matches = re.findall(r'((?<=")[A-Za-z].*?(?="))|([A-Za-z]+)', raw_input("Enter all search terms (on one line):\n"))
    # findall returns one (phrase, word) tuple per match with exactly one side
    # filled in; flatten and drop the empty halves
    search_terms = [term for groups in matches for term in groups if term]
print("your search_terms are: " + ", ".join(search_terms))
# get minimum score to show
try:
    min_score = int(raw_input("What is the minimum score for a submission to be included? Defaults to 100.\n"))
except ValueError:
    # Blank or non-numeric answer: use the advertised default. Only ValueError
    # is caught, so Ctrl-C at the prompt still aborts the script.
    min_score = 100

# subset submissions with applicable score
relevant_submissions = submissions[submissions['scores'] >= min_score]

# Search terms are matched literally. Escaping stops regex metacharacters
# inside a quoted phrase (e.g. "+", "(", "?") from breaking or altering the match.
escaped_terms = [re.escape(term) for term in search_terms]

# how to display
output_type = raw_input("Do you want to group by search term, including duplicates? (y/n)\n defaults to all unique relevant titles.\n")
titles = relevant_submissions.titles
if output_type.strip().lower() == 'y':
    # output grouped by search term
    for term, escaped in zip(search_terms, escaped_terms):
        print("\n Titles about " + term + ":")
        # case=False replaces the inline "(?i)" prefix
        print(relevant_submissions[titles.str.contains(escaped, case=False)])
else:
    # output all unique hits: one alternation over every term. The old
    # "(?i)a|(?i)b" form put global flags in the middle of the pattern, which
    # newer Python versions reject. With no search terms the pattern is empty
    # and every relevant title is shown, as before.
    pattern = "|".join(escaped_terms)
    print(relevant_submissions[titles.str.contains(pattern, case=False)])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment