Created
December 25, 2012 16:18
-
-
Save chrisle/4374021 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Looks for the escaped fragment meta tag (<meta name="fragment" content="!">).
# If found, the HTML snapshot URL can be requested instead of the original page.
#
# This is a mixin: it reads the +@response+ instance variable of the including
# object, which must respond to +search(xpath)+ for the meta tag to be detected.
module GoogleBotSimulator
  module EscapedFragment
    # Tells whether the fetched page opts in to escaped-fragment crawling.
    #
    # Returns true only when a <meta name="fragment"> tag whose content is
    # exactly "!" is found in @response; false otherwise, including when
    # @response is nil or cannot be searched.
    def has_meta_fragment?
      # Guard: nothing fetched yet, or the response has no #search method.
      return false unless @response.respond_to?(:search)

      @response.search('//meta[@name="fragment"]/@content').to_s == '!'
    end

    # Builds the snapshot URL by appending an empty _escaped_fragment_ parameter.
    #
    # url - the page URL as a String.
    #
    # Returns a new String; +url+ itself is not modified.
    def url_with_escaped_fragment(url)
      # A URL that already carries a query string needs "&", not a second "?".
      separator = url.include?('?') ? '&' : '?'
      "#{url}#{separator}_escaped_fragment_="
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'googlebot_simulator/escaped_fragment' | |
require 'googlebot_simulator/tags' | |
# Takes a URL and does various tests on it.
#
# === Example
#
#   sim_google = GooglebotSimulator.new('https://www.coursera.org/')
#
#   # Check if the page has an HTML snapshot (aka hashed fragment)
#   sim_google.has_meta_fragment? # => true
#
#   # Get the HTML page. If it has an HTML snapshot, get that instead.
#   html = sim_google.fetch_page
#
class GooglebotSimulator
  include GoogleBotSimulator::EscapedFragment
  include GoogleBotSimulator::Tags
  include GoogleBotSimulator::Header
  include GoogleBotSimulator::Links
  include GoogleBotSimulator::Images
  include GoogleBotSimulator::Detectors
  include GoogleBotSimulator::AsHtml
  include GoogleBotSimulator::Content

  attr_reader :response, :url, :agent

  VERSION = '1.0.0'.freeze
  GOOGLE_BOT = 'Googlebot/2.1 (+http://www.google.com/bot.html)'.freeze

  # Configures a Mechanize agent that identifies as Googlebot and immediately
  # fetches the page.
  #
  # url - the page URL as a String; a trailing slash is added when missing.
  def initialize(url)
    @agent = Mechanize.new
    @agent.robots = true               # follow robots.txt
    @agent.follow_meta_refresh = false # dont follow meta refresh
    @agent.redirect_ok = false         # dont follow redirects
    @agent.user_agent = GOOGLE_BOT     # set user agent to googlebot
    @url = add_trailing_slash(url)
    fetch_page
  end

  # Returns +url+ ending in exactly the slash it needs: the same String when it
  # already ends with "/", otherwise a new String with "/" appended.
  def add_trailing_slash(url)
    # Always return a URL; a bare modifier-if would yield nil for URLs that
    # already end in a slash.
    url.end_with?('/') ? url : "#{url}/"
  end

  # Returns +url+ with a single trailing "/" removed, if present.
  def no_trailing_slash(url)
    url.chomp('/')
  end

  # Fetches @url with the agent and stores the result in @response. When the
  # page carries the escaped fragment meta tag, @url is rewritten to the
  # snapshot URL and fetched again.
  #
  # Returns the final response object.
  def fetch_page
    Rails.logger.debug "Fetching: #{@url}"
    @response = @agent.get(@url)

    if has_meta_fragment?
      Rails.logger.debug "Escaped fragment meta tag detected."
      @url = url_with_escaped_fragment(no_trailing_slash(@url))
      @response = @agent.get(@url)
    end

    @response
  rescue Mechanize::ResponseReadError => e
    # Keep whatever could be parsed from the partial body instead of
    # discarding it.
    @response = e.force_parse
  end

  # Returns the real mechanize object
  def to_mechanize
    @agent
  end

  # Returns the page's nokogiri object
  def to_nokogiri
    @agent.page.at('/')
  end

  # Returns the scheme and host of @url, e.g. "https://www.coursera.org".
  # The port and path are not included.
  def domain_url
    uri = URI(@url)
    "#{uri.scheme}://#{uri.host}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment