---
title: "Analyzing Google Search Data"
author: "Jose Luis Rodriguez"
output:
  html_notebook: default
  html_document: default
date: "August 8, 2018"
subtitle: "CME Business Analytics Lab"
---
--------------
## About this Notebook
--------------
* The Google search data in this notebook comes from a Google account archive (Google Takeout)
* The steps outlined here to collect and analyze the data may change at any time
* Below are the steps to claim your Google account data
--------------
## Analytics Toolkit: Required Packages
--------------
**Install and load the required packages**
* Packages: tidyverse, tidytext, lubridate, wordcloud, rvest
```{r echo=FALSE}
# Check whether the package is already installed
if(!require("rvest")){
  # If the package is not on the system, install it
  install.packages("rvest", dependencies = TRUE)
  # Load the package
  library("rvest")
}
```
```{r echo=FALSE}
if(!require("tidytext")){
  install.packages("tidytext", dependencies = TRUE)
  library("tidytext")
}
```
```{r echo=FALSE}
if(!require("lubridate")){
  install.packages("lubridate", dependencies = TRUE)
  library("lubridate")
}
```
```{r echo=FALSE}
if(!require("wordcloud")){
  install.packages("wordcloud", dependencies = TRUE)
  library("wordcloud")
}
```
```{r echo=FALSE}
if(!require("tidyverse")){
  install.packages("tidyverse", dependencies = TRUE)
  library("tidyverse")
}
```
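The same install-then-load pattern can also be written once as a loop. This is just a compact, equivalent sketch of the per-package chunks above, not a required step.
```{r echo=FALSE}
# Compact equivalent of the checks above: install any missing package, then load it
pkgs <- c("tidyverse", "tidytext", "lubridate", "wordcloud", "rvest")
for (pkg in pkgs) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }
}
```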
--------------
## Data Collection: Claiming your Google Search Data
--------------
#### 1) Sign in to your Google account, then go to:
* https://myaccount.google.com/privacy
#### 2) Find the link to download your data archive, or go to:
* https://takeout.google.com/settings/takeout
#### 3) Select all Google products to create a complete archive of your data
#### 4) After selecting the products, choose the file type and maximum archive size to make sure that all your account data is archived (a quick check of the downloaded archive is sketched below)
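Once the archive is downloaded and extracted, a quick check confirms the search activity file is where the next step expects it. This is only a sketch, and it assumes the archive was unzipped into a `Takeout/` folder under the working directory.
```{r}
# List the extracted search activity files and confirm the expected HTML file exists
# (assumes the Takeout archive was unzipped into the working directory)
list.files("Takeout/My Activity/Search")
file.exists("Takeout/My Activity/Search/MyActivity.html")
```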
--------------
## Data Preparation: Extracting Google Search Information
--------------
#### Locate the Google archive, then find the search data. In this case, it is an HTML file named "MyActivity.html", located in the "Search" folder inside the "My Activity" folder:
* Takeout/My Activity/Search/MyActivity.html
#### Using the rvest package, we can read the HTML document that contains the Google search data
```{r}
doc <- "Takeout/My Activity/Search/MyActivity.html"
search_archive <- read_html(doc)
```
--------------
### Leveraging regular expressions (regex), we can extract relevant information from the HTML document
#### Extract Search Time
```{r}
# Each activity card is an inner div of the "mdl-grid" container;
# the timestamp appears after a <br> tag and ends with AM or PM
date_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()
```
#### Extract Search Text
```{r}
# The search query (or visited page title) is the text of the <a> link inside each card
text_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')
```
#### Extract Search Type
```{r}
# The first word before the link ("Searched" or "Visited") identifies the activity type
type_search <- search_archive %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
```
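Before assembling a dataframe, it is worth a quick sanity check that the three extracted vectors line up and that the activity types look as expected. This is an optional check, not part of the original workflow.
```{r}
# The three vectors should have the same length (one entry per activity card)
c(dates = length(date_search), text = length(text_search), types = length(type_search))
# Distribution of activity types, e.g. "Searched" vs. "Visited"
table(type_search, useNA = "ifany")
```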
#### Create a dataframe using the data extracted from the HTML file
```{r}
search_data <- tibble(timestamp = date_search,
                      date = as_date(date_search),
                      year = year(date_search),
                      month = month(date_search, label = TRUE),
                      day = weekdays(date_search),
                      hour = hour(date_search),
                      type = type_search,
                      search = text_search)

search_data$day <- factor(search_data$day,
                          levels = c("Sunday", "Monday", "Tuesday",
                                     "Wednesday", "Thursday", "Friday",
                                     "Saturday"))

search_data <- na.omit(search_data)
head(search_data)
```
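A quick look at the number of records and the period the archive covers helps put the yearly plots below in context; this is a small optional check.
```{r}
# Number of searches kept after cleaning, and the date range they span
nrow(search_data)
range(search_data$timestamp)
```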
--------------
## Data Analysis: Visualizing Google Searches
--------------
#### To get an overall idea of the search volume, we can plot searches by year
```{r}
p <- ggplot(search_data, aes(year))
p + geom_bar() + labs(x = "", title = "Volume of Google Searches Per Year")
ggsave("imgs/00-search_volume.png",
       width = 10, height = 7,
       dpi = 300, units = "in", device = 'png')
```
### After determining the year(s) with the largest search volume, we can plot monthly searches for those year(s)
```{r}
monthly <- search_data[(search_data$year > 2014 & search_data$year < 2018), ]
tail(monthly)[1:4, ]
```
```{r}
ggplot(monthly) + geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(. ~ year, scales = "free") +
  labs(x = "", title = "Volume of Google Searches By Month")
ggsave("imgs/01-search_volume.png",
       width = 10, height = 7,
       dpi = 300, units = "in", device = 'png')
```
#### Another interesting metric is searches by hour
```{r}
p <- ggplot(search_data, aes(hour))
p + geom_bar() + labs(title = "Volume of Google Searches Per Hour")
ggsave("imgs/02-search_volume.png",
       width = 10, height = 7,
       dpi = 300, units = "in", device = 'png')
```
#### We can also plot the search data by day of the week to determine which days are the most active
```{r}
p <- ggplot(search_data, aes(day))
p + geom_bar() + labs(x = "", title = "Volume of Google Searches Per Weekday")
ggsave("imgs/03-search_volume.png",
       width = 10, height = 7,
       dpi = 300, units = "in", device = 'png')
```
```{r}
cnt <- search_data %>% count(day)
cnt$percent <- cnt$n / sum(cnt$n) * 100
cnt$zscore <- c(scale(cnt$n))
cnt
```
#### We can take it a step further and group search time with day of the week.
```{r}
ggplot(search_data) +
  geom_bar(aes(x = hour, group = day)) +
  facet_grid(. ~ day, scales = "free") +
  labs(title = "Volume of Google Searches Per Weekday/Hour")
ggsave("imgs/04-search_volume.png",
       width = 10, height = 7,
       dpi = 300, units = "in", device = 'png')
```
#### We can group the search data by year and day of the week to visualize the overall trends
```{r}
wkday <- group_by(search_data, year, day) %>% summarize(count = n())
head(wkday)
```
```{r}
p <- ggplot(wkday, aes(day, count, fill = year))
p + geom_bar(stat = "identity") + labs(x = "", y = "count")
ggsave("imgs/05-search_volume.png",
       width = 11, height = 9,
       dpi = 300, units = "in", device = 'png')
```
--------------
## Reporting: A Wordcloud from Google Search Data
--------------
#### First we need to extract the text and clean it using regular expressions
```{r}
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
replace_reg <- '(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]|(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"'
```
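To preview what `replace_reg` strips out, here is the substitution applied to a made-up search string; the example string is purely hypothetical and only meant for illustration.
```{r}
# Hypothetical example: URLs, hashtags, and punctuation are replaced with spaces
example <- 'Visited https://www.r-project.org/about.html #rstats "tidytext tutorial"'
str_replace_all(example, pattern = replace_reg, replacement = " ")
```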
```{r}
# type_visited = "Visited"
# type_searched = "Searched"
# search_data[search_data$type == type_visited, ]$search
search <- search_data$search %>%
  str_replace_all(pattern = replace_reg, replacement = " ") %>%
  iconv(from = "ASCII", to = "UTF-8", sub = " ") %>%
  tolower() %>%
  trimws()
```
```{r}
search <- tibble(text = search) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word, str_detect(word, "[a-z]"))
```
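Before setting the wordcloud thresholds, a frequency table of the most common tokens gives a sense of what sensible cut-offs look like. This is a quick optional check on the tokenized data.
```{r}
# Most frequent tokens after removing the standard stop words
search %>%
  count(word, sort = TRUE) %>%
  head(10)
```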
#### Set a threshold for the min/max frequency of words to create the wordcloud
```{r}
remove_words <- c("chrome", "searched", "chicago", "jlroo",
                  "google", "loyola", "university", "luc",
                  "business", "analysis", "documentation")
my_stop_words <- bind_rows(tibble(word = remove_words, lexicon = "SMART"), stop_words)
```
```{r}
min_freq <- 100
fig_scale <- c(4, 0.5)
max_words <- 100
```
```{r message=FALSE}
search %>%
  anti_join(my_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n,
                 scale = fig_scale,
                 min.freq = min_freq,
                 max.words = max_words))
```
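Unlike the ggplot2 figures above, `wordcloud()` draws with base graphics, so `ggsave()` does not capture it. If you want to save the wordcloud alongside the other figures, a base graphics device can be opened around the call; this is a sketch that reuses the `imgs/` folder from the earlier chunks, and the output filename is only illustrative.
```{r message=FALSE}
# Save the wordcloud with a base graphics device, since ggsave() only captures
# ggplot2 output (assumes the imgs/ folder already exists; filename is illustrative)
png("imgs/06-search_wordcloud.png", width = 10, height = 7, units = "in", res = 300)
search %>%
  anti_join(my_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n,
                 scale = fig_scale,
                 min.freq = min_freq,
                 max.words = max_words))
dev.off()
```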