Created
June 8, 2013 11:26
-
-
Save davidhq/5734878 to your computer and use it in GitHub Desktop.
Traverse and download all books from http://it-ebooks.info
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'open-uri' | |
MARK_FILE = "mark.dat" | |
def wait_for_threads(threads) | |
print "Waiting for downloads to finish..." | |
threads.each { |t| t.join } | |
puts " ok" | |
end | |
def quit(num, threads) | |
if threads.any? | |
wait_for_threads(threads) | |
end | |
File.open(MARK_FILE, 'w') { |f| f.write(num - 1) } | |
exit | |
end | |
EXCLUDE = %w(ActionScript flash flex active_directory) | |
def exclude?(book) | |
for word in EXCLUDE | |
return true if book[:title].gsub('#', 'AAA') =~ /\b#{word.gsub('#', 'AAA').gsub('_', ' ')}\b/i | |
end | |
false | |
end | |
if ARGV.first == 'help' | |
puts "Get all new books: ruby it-ebooks.rb" | |
puts "Get n books from last download: ruby it-ebooks.rb 100" | |
puts "Options: y - download, ENTER - next, d - description, q - quit" | |
puts "(CTRL+C also works for quitting but the current position in not remembered)" | |
exit | |
end | |
puts "Working..." | |
mark = File.exists?(MARK_FILE) ? File.read(MARK_FILE).to_i : 0 | |
max_num = Nokogiri::HTML(open('http://it-ebooks.info/')).css('a').select { |a| a['href'] =~ /\/book\// }.map { |a| a[:href].match(/book\/(\d+)/)[1].to_i }.max | |
latest = ARGV.first ? [mark + ARGV.first.to_i, max_num].min : max_num | |
books = [] | |
nothing_to_do = true | |
for num in (mark + 1).upto(latest) | |
nothing_to_do = false | |
page = open("http://it-ebooks.info/book/#{num}/") | |
if page.base_uri.path == "/404/" | |
puts "#{num} - Not Found" | |
else | |
doc = Nokogiri::HTML(page) | |
div = doc.css("div[itemtype='http://schema.org/Book']") | |
book = {} | |
book[:num] = num | |
book[:title] = div.css("h1").text | |
book[:subtitle] = div.css("h3").text | |
book[:publisher] = doc.css("a[itemprop='publisher']").text | |
book[:description] = doc.css("span[itemprop='description']").text | |
book[:year] = doc.css("b[itemprop='datePublished']").text | |
book[:link] = "http://it-ebooks.info#{doc.css('a#dl').first['href']}" | |
if exclude?(book) | |
puts "Excluding #{num}: #{book[:title]} (#{book[:year]})" | |
else | |
puts "#{num} (#{num - mark}/#{latest - mark})" | |
books << book | |
end | |
end | |
end | |
puts | |
threads = [] | |
books.each_with_index do |book, index| | |
puts "#{book[:num]} (#{index + 1}/#{books.count})" | |
puts book[:title] | |
puts book[:subtitle] unless book[:subtitle].empty? | |
puts "#{book[:publisher]}, #{book[:year]}" | |
input = STDIN.gets.chomp.downcase | |
quit(book[:num], threads) if input == "q" | |
if input == "d" | |
puts book[:description] | |
input = STDIN.gets.chomp.downcase | |
quit(book[:num], threads) if input == "q" | |
end | |
if input == "y" | |
threads << Thread.new do | |
begin | |
f = open(book[:link]) | |
cd = f.meta['content-disposition'] | |
if cd.nil? # content-disposition was missing, try again | |
f = open(book[:link]) | |
cd = f.meta['content-disposition'] | |
end | |
filename = cd.match(/filename=(\"?)(.+)\1/)[2] | |
if File.exists?(filename) | |
puts "#{filename} already exists!" | |
else | |
File.open(filename, "wb") do |file| | |
file.write f.read | |
end | |
end | |
rescue => e | |
puts | |
puts "### Error downloading book #{book[:num]}: #{e.message} ###" | |
end | |
end | |
puts | |
end | |
end | |
if threads.any? | |
wait_for_threads(threads) | |
elsif nothing_to_do | |
puts "No new books." | |
end | |
File.open(MARK_FILE, 'w') { |f| f.write(latest) } |
As of recently, the website has changed their system for new books. Do you know anyway around it?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does it works?