-
-
Save bjhomer/4489273 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'mechanize' | |
# Maps a POV name as it appears on a chapter page to the name its count is
# filed under, so that spelling variants of one character are tallied together
# (e.g. both "Egeanin Sarna" and "Egeanin Tamarath" count as "Egeanin").
# Some entries shorten a full name, others expand a short one
# (e.g. "Thom" => "Thom Merrilin").
#
# Kept as a global because get_chapter_povs reads it under this name; frozen
# because it is a read-only lookup table.
$name_mapping = {
  "Egwene al'Vere" => "Egwene",
  "Elaida a'Roihan" => "Elaida",
  "Verin Mathwin" => "Verin",
  "Cadsuane Melaidhrin" => "Cadsuane",
  "Egeanin Sarna" => "Egeanin",
  "Egeanin Tamarath" => "Egeanin",
  "Isam" => "Isam/Luc",
  "Morgase Trakand" => "Morgase",
  "Sheriam Bayanar" => "Sheriam",
  "Fortuona" => "Tuon",
  "Ituralde" => "Rodel Ituralde",
  "Alviarin Freidhen" => "Alviarin",
  "Galina" => "Galina Casban",
  "Siuan" => "Siuan Sanche",
  "Egwene POV" => "Egwene",
  "Galad Damodred" => "Galad",
  "High Lady Suroth Sabelle Meldarath" => "Suroth",
  "Leane Sharif" => "Leane",
  "Romanda Cassin" => "Romanda",
  "Thom" => "Thom Merrilin"
}.freeze
# Utility methods
# Adds every count in second_pov onto the matching entry of first_pov.
#
# first_pov is modified in place and returned; second_pov is left untouched.
# Missing or nil counts on either side are treated as 0 (via to_i).
#
# @param first_pov [Hash] name => count accumulator (mutated)
# @param second_pov [Hash] name => count to add
# @return [Hash] first_pov itself
def merge_povs(first_pov, second_pov)
  second_pov.each do |name, count|
    first_pov[name] = first_pov[name].to_i + count.to_i
  end
  first_pov
end
# Scraper methods
# Fetches one chapter page and counts the POV names found on it.
#
# Every element matched by the selector "p b" contributes one count under its
# stripped text, after translation through $name_mapping when an entry exists.
# Elements whose text is blank after stripping are skipped, so an empty bold
# tag is not tallied as a POV named "".
#
# @param agent [#get] object whose get(url) returns a page responding to
#   search(selector) (a Mechanize agent in this script)
# @param chapter_url [String] URL handed to agent.get as-is
# @return [Hash] canonical POV name => number of occurrences on the page
def get_chapter_povs(agent, chapter_url)
  puts " Fetching #{chapter_url}..."
  chapter = agent.get(chapter_url)

  chapter.search("p b").each_with_object({}) do |node, counts|
    name = node.text.strip
    next if name.empty?

    # Fall back to the name as written when no canonical form is registered.
    name = $name_mapping.fetch(name, name)
    counts[name] = counts.fetch(name, 0) + 1
  end
end
# Fetches a book's index page and totals the POV counts of every chapter
# linked from it.
#
# Chapter links are taken from "ol li a"; links under "ul li a" (prologues
# and similar extras) are added as well. Anchors without an href are ignored,
# and a URL matched more than once is fetched only once so its POVs are not
# double-counted.
#
# A chapter that fails to load or parse is reported on stderr and contributes
# nothing; the remaining chapters are still processed (best-effort scrape).
#
# @param agent [#get] object whose get(url) returns a page responding to
#   search(selector) (a Mechanize agent in this script)
# @param starting_page [String] site root, without a trailing slash
# @param book_url [String] book link relative to starting_page
# @return [Hash] canonical POV name => total count across the book
def get_book_pov(agent, starting_page, book_url)
  puts "Fetching #{book_url}..."
  book = agent.get("#{starting_page}/#{book_url}")

  # Scrape chapter URLs first, then prologues and friends.
  chapter_urls = ["ol li a", "ul li a"].flat_map do |selector|
    book.search(selector).map do |link|
      href = link.attributes["href"]
      href && href.value
    end
  end
  chapter_urls = chapter_urls.compact.uniq

  chapter_urls.each_with_object({}) do |chapter_url, book_povs|
    povs =
      begin
        get_chapter_povs(agent, chapter_url)
      rescue StandardError => e
        # Keep going on a bad chapter, but say so instead of hiding it.
        warn " Skipping #{chapter_url}: #{e.class}: #{e.message}"
        {}
      end
    merge_povs(book_povs, povs)
  end
end
# The real business: scraping books for POVs
# Scrapes the site index for book links ("ol li a"), prints each book's POV
# counts as they are gathered, and finishes with the overall totals sorted by
# descending count, ties broken by name.
def main
  starting_page = "http://encyclopaedia-wot.org"
  agent = Mechanize.new
  index = agent.get(starting_page)
  book_urls = index.search('ol li a').map { |link| link.attributes["href"].value }

  final_povs = {}
  book_urls.each do |book|
    book_povs = get_book_pov(agent, starting_page, book)
    puts "-- #{book}"
    pp book_povs
    merge_povs(final_povs, book_povs)
  end

  # Negating the count sorts highest first while names stay ascending.
  ranked = final_povs.sort_by { |name, count| [-count, name] }
  report_lines = ranked.map { |name, count| "%20s => %d" % [name, count] }
  puts report_lines
end
# Run the scrape only when this file is executed as a script, so that loading
# it (e.g. to reuse or test the helper methods) does not hit the network.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment