chrisle · December 25, 2012 16:18
diff --git a/escaped_fragment.rb b/escaped_fragment.rb
 # Looks for the escaped fragment meta tag.  If found, gets the HTML snapshot
 # instead

 module GoogleBotSimulator::EscapedFragment

   # Mixin contract: the including object stores the fetched page in
   # @response. The page is queried with an XPath expression via #search.

   # Tells whether the fetched page carries <meta name="fragment" content="!">.
   #
   # Returns true only when the content attribute is exactly '!'. Returns
   # false when @response cannot be searched (nil, or an object without
   # #search -- presumably a non-HTML download; confirm against Mechanize).
   def has_meta_fragment?
     return false unless @response.respond_to?(:search)

     @response.search('//meta[@name="fragment"]/@content').to_s == '!'
   end

   # Builds the URL used to request the HTML snapshot of +url+.
   #
   # url - String URL of the page.
   #
   # Returns a new String with an empty "_escaped_fragment_=" parameter
   # appended. The parameter is joined with '&' when +url+ already has a
   # query string and with '?' otherwise, so the result stays a valid URL.
   # +url+ itself is not modified.
   def url_with_escaped_fragment(url)
     separator = url.include?('?') ? '&' : '?'
     "#{url}#{separator}_escaped_fragment_="
   end

 end
diff --git a/google_bot_simulator.rb b/google_bot_simulator.rb
 require 'googlebot_simulator/escaped_fragment'
 require 'googlebot_simulator/tags'

 # Takes a URL and does various tests on it.
 # 
 # === Example
 # 
 #   sim_google = GooglebotSimulator.new('https://www.coursera.org/')
 #
 #   # Check if the page declares an HTML snapshot (escaped fragment meta tag)
 #   sim_google.has_meta_fragment?  # => true
 #
 #   # The page is fetched on construction. If it declares an HTML snapshot,
 #   # the snapshot is fetched and stored instead.
 #   page = sim_google.response
 #   
 class GooglebotSimulator
  include GoogleBotSimulator::EscapedFragment
  include GoogleBotSimulator::Tags
  include GoogleBotSimulator::Header
  include GoogleBotSimulator::Links
  include GoogleBotSimulator::Images
  include GoogleBotSimulator::Detectors
  include GoogleBotSimulator::AsHtml
  include GoogleBotSimulator::Content

  attr_reader :response, :url, :agent

  VERSION = '1.0.0'
  GOOGLE_BOT = 'Googlebot/2.1 (+http://www.google.com/bot.html)'

  def initialize(url)
    @agent = Mechanize.new
    @agent.robots = true                  # follow robots.txt
    @agent.follow_meta_refresh = false    # dont follow meta refresh
    @agent.redirect_ok = false            # dont follow redirects
    @agent.user_agent = GOOGLE_BOT        # set user agent to googlebot
    @url = add_trailing_slash(url)
    fetch_page
  end

  def add_trailing_slash(url)
    url + '/' if url.last != '/'
  end

  def no_trailing_slash(url)
    url.gsub(/\/$/, '')
  end

  def fetch_page
    Rails.logger.debug "Fetching: #{@url}"
    @response = @agent.get(@url)

    if has_meta_fragment?
      Rails.logger.debug "Escaped fragment meta tag detected."
      @url = url_with_escaped_fragment(no_trailing_slash(@url))
      @response = @agent.get(@url)
    end

  rescue Mechanize::ResponseReadError => e
    e.force_parse
  end

  # Returns the real mechanize object
  def to_mechanize
    @agent
  end

  # Returns the page's nokogiri object
  def to_nokogiri
    @agent.page.at('/')
  end

  def domain_url
    uri = URI(@url)
    "#{uri.scheme}://#{uri.host}"
  end

 end
	# Looks for the escaped fragment meta tag. If found, gets the HTML snapshot
	# instead

	module GoogleBotSimulator::EscapedFragment

	  # Mixin contract: the including object stores the fetched page in
	  # @response. The page is queried with an XPath expression via #search.

	  # Tells whether the fetched page carries <meta name="fragment" content="!">.
	  #
	  # Returns true only when the content attribute is exactly '!'. Returns
	  # false when @response cannot be searched (nil, or an object without
	  # #search -- presumably a non-HTML download; confirm against Mechanize).
	  def has_meta_fragment?
	    return false unless @response.respond_to?(:search)

	    @response.search('//meta[@name="fragment"]/@content').to_s == '!'
	  end

	  # Builds the URL used to request the HTML snapshot of +url+.
	  #
	  # url - String URL of the page.
	  #
	  # Returns a new String with an empty "_escaped_fragment_=" parameter
	  # appended. The parameter is joined with '&' when +url+ already has a
	  # query string and with '?' otherwise, so the result stays a valid URL.
	  # +url+ itself is not modified.
	  def url_with_escaped_fragment(url)
	    separator = url.include?('?') ? '&' : '?'
	    "#{url}#{separator}_escaped_fragment_="
	  end

	end
	require 'googlebot_simulator/escaped_fragment'
	require 'googlebot_simulator/tags'

	# Takes a URL and does various tests on it.
	#
	# === Example
	#
	# sim_google = GooglebotSimulator.new('https://www.coursera.org/')
	#
	# # Check if the page declares an HTML snapshot (escaped fragment meta tag)
	# sim_google.has_meta_fragment? # => true
	#
	# # The page is fetched on construction. If it declares an HTML snapshot,
	# # the snapshot is fetched and stored instead.
	# page = sim_google.response
	#
	class GooglebotSimulator
	  # NOTE(review): this class is spelled GooglebotSimulator while the mixins
	  # live under the GoogleBotSimulator namespace -- confirm this is intended.
	  include GoogleBotSimulator::EscapedFragment
	  include GoogleBotSimulator::Tags
	  include GoogleBotSimulator::Header
	  include GoogleBotSimulator::Links
	  include GoogleBotSimulator::Images
	  include GoogleBotSimulator::Detectors
	  include GoogleBotSimulator::AsHtml
	  include GoogleBotSimulator::Content

	  # response - the page object returned by the last fetch
	  # url      - the URL that was last requested (may be the snapshot URL)
	  # agent    - the configured Mechanize agent
	  attr_reader :response, :url, :agent

	  VERSION = '1.0.0'.freeze
	  GOOGLE_BOT = 'Googlebot/2.1 (+http://www.google.com/bot.html)'.freeze

	  # Configures a Mechanize agent that identifies itself as Googlebot and
	  # immediately fetches +url+ (see #fetch_page).
	  #
	  # url - String URL of the page to inspect. A trailing slash is added when
	  #       it is missing.
	  def initialize(url)
	    @agent = Mechanize.new
	    @agent.robots = true # follow robots.txt
	    @agent.follow_meta_refresh = false # dont follow meta refresh
	    @agent.redirect_ok = false # dont follow redirects
	    @agent.user_agent = GOOGLE_BOT # set user agent to googlebot
	    @url = add_trailing_slash(url)
	    fetch_page
	  end

	  # Returns +url+ ending in exactly the slash it needs: the same string when
	  # it already ends with '/', otherwise a new string with '/' appended.
	  # (Never returns nil, so the result is always safe to store in @url.)
	  def add_trailing_slash(url)
	    url.end_with?('/') ? url : "#{url}/"
	  end

	  # Returns +url+ with a single trailing '/' removed, if there is one.
	  def no_trailing_slash(url)
	    url.chomp('/')
	  end

	  # Fetches @url and stores the page in @response. When the page carries
	  # the escaped fragment meta tag, @url is switched to the snapshot URL and
	  # that is fetched instead.
	  #
	  # Returns the page stored in @response.
	  def fetch_page
	    Rails.logger.debug "Fetching: #{@url}"
	    @response = @agent.get(@url)

	    # Skip the rewrite when @url already is a snapshot URL (e.g. on a second
	    # call), otherwise the parameter would be appended again.
	    if has_meta_fragment? && !@url.include?('_escaped_fragment_=')
	      Rails.logger.debug "Escaped fragment meta tag detected."
	      @url = url_with_escaped_fragment(no_trailing_slash(@url))
	      @response = @agent.get(@url)
	    end

	    @response
	  rescue Mechanize::ResponseReadError => e
	    # The body could not be read completely; keep whatever Mechanize can
	    # parse from it rather than discarding the result.
	    @response = e.force_parse
	  end

	  # Returns the real mechanize object
	  def to_mechanize
	    @agent
	  end

	  # Returns the page's nokogiri object (the node matched by '/' on the
	  # agent's current page).
	  def to_nokogiri
	    @agent.page.at('/')
	  end

	  # Returns "scheme://host" for the current @url. Port, path and query are
	  # not included.
	  def domain_url
	    uri = URI(@url)
	    "#{uri.scheme}://#{uri.host}"
	  end

	end