stevedya · May 5, 2020 23:40
diff --git a/tidytext.R b/tidytext.R
 library(dplyr)
 library(tidytext)
 library(ggplot2)

 # Read the source CSV from a hard-coded absolute path.
 data <- read.csv('/Users/steven/Desktop/JEOPARDY_CSV.csv')
 # Re-encode the Question column to the "utf-8-mac" iconv encoding; `from` is
 # left at its default.
 # NOTE(review): "utf-8-mac" looks macOS-specific -- confirm it is available
 # before running this on another platform.
 text_set <- iconv(data$Question, to = "utf-8-mac")

 # Tokenise the question text into one lower-cased word per row, apply the
 # placeholder word replacement, then drop every row whose word is listed in
 # stop_words.
 clean_text <- text_set %>%
   as_tibble() %>%
   unnest_tokens(
     word, value,
     to_lower = TRUE, strip_punct = TRUE, strip_numeric = TRUE
   ) %>%
   # Base sub() replaces the first match in each word. It is used instead of
   # str_replace() because stringr is not attached by the library() calls
   # visible in this script.
   mutate(word = sub("word-to-replace", "replacement-word", word)) %>%
   # Explicit key: no reliance on the implicit natural join (and its message).
   anti_join(stop_words, by = "word")
  
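  # To map several words onto one replacement, add this step to the pipeline:
  # mutate(word = if_else(word %in% c("emailing", "emails"), "email", word))
  
  # To replace a single word, add this step (str_replace() comes from stringr,
  # so the script also needs library(stringr)):
  # mutate(word = str_replace(word, "word-to-replace", "replacement-word")) %>%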
  
 # Count how often each word occurs under each Bing sentiment label, sorted
 # with the most frequent first. Run print(bing_word_counts) to inspect it.
 bing_word_counts <- clean_text %>%
   # Explicit key: no reliance on the implicit natural join (and its message).
   inner_join(get_sentiments("bing"), by = "word") %>%
   count(word, sentiment, sort = TRUE) %>%
   # Kept as a guard in case clean_text ever arrives grouped.
   ungroup()

 # Plot the most frequent words for each sentiment as horizontal bars, one
 # facet per sentiment.
 bing_word_counts %>%
   group_by(sentiment) %>%
   # Rank by the count column n explicitly rather than relying on top_n()'s
   # "last column" default; ties are kept, so a facet can show more than ten.
   top_n(10, n) %>%
   ungroup() %>%
   # Reorder the words by count so the bars are drawn in sorted order.
   mutate(word = reorder(word, n)) %>%
   ggplot(aes(word, n, fill = sentiment)) +
   geom_col(show.legend = FALSE) +
   facet_wrap(~sentiment, scales = "free_y") +
   labs(
     y = "Contribution to sentiment",
     x = NULL
   ) +
   # Flip the axes so the words read along the vertical axis.
   coord_flip()


 # Notes
 # # Custom stop words to remove from the analysis
 # custom_stop_words <- tibble(word = c("trump","critical","issues","issue"))
 # 
 # # To drop these extra stop words, add this step to the clean_text pipeline
 #   anti_join(custom_stop_words, by = "word")
	library(dplyr)
	library(tidytext)
	library(ggplot2)

	# Read the source CSV from a hard-coded absolute path.
	data <- read.csv('/Users/steven/Desktop/JEOPARDY_CSV.csv')
	# Re-encode the Question column to the "utf-8-mac" iconv encoding; `from` is
	# left at its default.
	# NOTE(review): "utf-8-mac" looks macOS-specific -- confirm it is available
	# before running this on another platform.
	text_set <- iconv(data$Question, to = "utf-8-mac")

	# Tokenise the question text into one lower-cased word per row, apply the
	# placeholder word replacement, then drop every row whose word is listed in
	# stop_words.
	clean_text <- text_set %>%
	  as_tibble() %>%
	  unnest_tokens(
	    word, value,
	    to_lower = TRUE, strip_punct = TRUE, strip_numeric = TRUE
	  ) %>%
	  # Base sub() replaces the first match in each word. It is used instead of
	  # str_replace() because stringr is not attached by the library() calls
	  # visible in this script.
	  mutate(word = sub("word-to-replace", "replacement-word", word)) %>%
	  # Explicit key: no reliance on the implicit natural join (and its message).
	  anti_join(stop_words, by = "word")

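	# To map several words onto one replacement, add this step to the pipeline:
	# mutate(word = if_else(word %in% c("emailing", "emails"), "email", word))

	# To replace a single word, add this step (str_replace() comes from stringr,
	# so the script also needs library(stringr)):
	# mutate(word = str_replace(word, "word-to-replace", "replacement-word")) %>%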

	# Count how often each word occurs under each Bing sentiment label, sorted
	# with the most frequent first. Run print(bing_word_counts) to inspect it.
	bing_word_counts <- clean_text %>%
	  # Explicit key: no reliance on the implicit natural join (and its message).
	  inner_join(get_sentiments("bing"), by = "word") %>%
	  count(word, sentiment, sort = TRUE) %>%
	  # Kept as a guard in case clean_text ever arrives grouped.
	  ungroup()

	# Plot the most frequent words for each sentiment as horizontal bars, one
	# facet per sentiment.
	bing_word_counts %>%
	  group_by(sentiment) %>%
	  # Rank by the count column n explicitly rather than relying on top_n()'s
	  # "last column" default; ties are kept, so a facet can show more than ten.
	  top_n(10, n) %>%
	  ungroup() %>%
	  # Reorder the words by count so the bars are drawn in sorted order.
	  mutate(word = reorder(word, n)) %>%
	  ggplot(aes(word, n, fill = sentiment)) +
	  geom_col(show.legend = FALSE) +
	  facet_wrap(~sentiment, scales = "free_y") +
	  labs(
	    y = "Contribution to sentiment",
	    x = NULL
	  ) +
	  # Flip the axes so the words read along the vertical axis.
	  coord_flip()


	# Notes
	# # Custom stop words to remove from the analysis
	# custom_stop_words <- tibble(word = c("trump","critical","issues","issue"))
	#
	# # To drop these extra stop words, add this step to the clean_text pipeline
	# anti_join(custom_stop_words, by = "word")