Created
June 11, 2011 14:17
-
-
Save archiloque/1020589 to your computer and use it in GitHub Desktop.
Scrapper les données d'allocine.fr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
# This script fetches the list of films from the pages that list releases by month
require_relative '../allocine_models' | |
require 'nokogiri' | |
require 'typhoeus' | |
# Number of months of data to download: 120 months = 10 years of films.
month_number = 120
# Builds the request that lists the films released during a given month.
#
# The request's completion handler parses the monthly agenda page and saves
# a Film row (id, title, release date) for every film not already stored.
#
# @param month_date [Date] a date inside the month to fetch; only its year
#   and month are used
# @return [Typhoeus::Request] the request, ready to be queued
def create_request(month_date)
  # URL of the page to scrape
  page_url = "http://www.allocine.fr/film/agenda_mois.html?month=#{month_date.strftime("%Y-%m")}"
  request = Typhoeus::Request.new(page_url)
  # once the page has been fetched
  request.on_complete do |response|
    # parse the content so it can be queried easily
    doc = Nokogiri::HTML(response.body, page_url, 'UTF-8')
    # look for the blocks that correspond to release days
    doc.search('.vmargin20b').each do |liste|
      # the release day is read from the first two characters of the heading
      heading = liste.parent.search('h2')[0]
      next unless heading
      day_in_month = heading.text[0...2].strip.to_i
      # invalid date (parsed as 0 or impossible for this month): it happens
      # on some pages, so the block is skipped instead of raising
      next unless Date.valid_date?(month_date.year, month_date.month, day_in_month)
      actual_date = Date.new(month_date.year, month_date.month, day_in_month)
      # for every film of the list
      liste.search('.bold a').each do |f|
        # the film title is the text of the link
        title = f.text.strip
        # the film id is the number between "=" and "." in the link URL;
        # a regexp keeps working when the URL is absolute or the href is missing
        raw_id = f[:href].to_s[/=(\d+)\./, 1]
        next unless raw_id
        film_id = raw_id.to_i
        # a film released several times is only created once
        next if Film.first(:id => film_id)
        film = Film.new
        film.id = film_id
        film.title = title
        film.pub_date = actual_date
        film.save
      end
    end
  end
  # return the request
  request
end
hydra = Typhoeus::Hydra.new
# today's date is the starting point
current_date = Date.today
# queue one request per month, walking backwards in time
month_number.times do
  hydra.queue create_request(current_date)
  # Date#<< moves the date back by the given number of months
  current_date = current_date << 1
end
# run the requests and wait until they are all done
hydra.run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require 'sequel' | |
# Creates the tables that store the scraped data.
#
# The :Bignum symbol and the Integer column method are used instead of the
# Bignum and Fixnum constants, which no longer exist in recent Ruby versions.
Sequel.migration do
  change do
    # films
    create_table(:films) do
      primary_key :id, :type => :Bignum, :unsigned => true
      String :title, :null => false
      # release date
      Date :pub_date, :null => false, :index => true
      # whether the film page has been scraped
      Boolean :scrapped, :index => true, :default => false
      # average grades given by the press and by the audience
      Float :average_press, :null => true
      Float :average_people, :null => true
    end
    # genres
    create_table(:genres) do
      primary_key :id, :type => :Bignum, :unsigned => true
      String :name, :null => false, :unique => true, :index => true
    end
    create_table(:films_genres) do
      # same column type as the referenced primary keys, like the other join tables
      foreign_key :film_id, :films, :type => :Bignum, :unsigned => true, :index => true, :null => false
      foreign_key :genre_id, :genres, :type => :Bignum, :unsigned => true, :index => true, :null => false
    end
    # countries
    create_table(:countries) do
      primary_key :id, :type => :Bignum, :unsigned => true
      String :name, :null => false, :unique => false, :index => true
    end
    create_table(:countries_films) do
      foreign_key :film_id, :films, :type => :Bignum, :unsigned => true
      foreign_key :country_id, :countries, :type => :Bignum, :unsigned => true
    end
    # directors
    create_table(:directors) do
      primary_key :id, :type => :Bignum, :unsigned => true
      String :name, :null => false, :unique => false, :index => true
    end
    create_table(:directors_films) do
      foreign_key :film_id, :films, :type => :Bignum, :unsigned => true
      foreign_key :director_id, :directors, :type => :Bignum, :unsigned => true
    end
    # tags
    create_table(:tags) do
      primary_key :id, :type => :Bignum, :unsigned => true
      String :name, :null => false, :unique => false, :index => true
    end
    create_table(:films_tags) do
      foreign_key :film_id, :films, :type => :Bignum, :unsigned => true
      foreign_key :tag_id, :tags, :type => :Bignum, :unsigned => true
    end
    # press critics
    create_table(:critics) do
      primary_key :id
      String :name, :null => false, :unique => true, :index => true
    end
    # grades given by a critic to a film
    create_table(:grades) do
      foreign_key :film_id, :films, :type => :Bignum, :unsigned => true
      # critics uses the default primary key type, so no type override here
      foreign_key :critic_id, :critics
      Integer :value, :null => false, :unsigned => true, :index => true
    end
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
# This script fetches the details of the films that were listed by the previous script
require_relative '../allocine_models' | |
require 'nokogiri' | |
require 'typhoeus' | |
# Extracts the numeric id from a URL of the form /film/tous/genre-13023/.
#
# The id is the first run of digits that directly follows a hyphen, so the
# result does not depend on where the slashes are (relative and absolute
# URLs both work).
#
# @param href [String] the link target
# @return [Integer] the id, or 0 when the URL contains no "-<digits>" part
def get_id2(href)
  href[/-(\d+)/, 1].to_i
end
# Tag names look like "sportifs au cinéma (98)", with the number of
# occurrences between parentheses; the first capture group is the name
# without that counter.
TAG_NAME_REGEX = /(.*) \(\d+\)/
# Returns the row of +model+ whose id is +id+, creating it with +name+ when
# it does not exist yet.
def find_or_create_by_id(model, id, name)
  record = model.find(:id => id)
  unless record
    record = model.new
    record.id = id
    record.name = name
    record.save
  end
  record
end

# Reads the number written between the parentheses located at +opening+ and
# +closing+ in +text+ (decimal comma accepted); nil when either is missing.
def parse_average(text, opening, closing)
  return nil unless opening && closing && closing > opening
  text[(opening + 1)...closing].gsub(',', '.').to_f
end

# Builds the request that scrapes the page of one film.
#
# The completion handler attaches the directors, countries, genres and tags
# to the film, stores the press grades and both averages, then flags the
# film as scraped and saves it.
#
# @param film [Film] the film to scrape; its id is used to build the URL
# @return [Typhoeus::Request] the request, ready to be queued
def create_request(film)
  page_url = "http://www.allocine.fr/film/fichefilm_gen_cfilm=#{film.id}.html"
  request = Typhoeus::Request.new(page_url)
  # once the page has been fetched
  request.on_complete do |response|
    # a failed download must not flag the film as scraped, otherwise it
    # would never be retried by a later run
    next unless response.success?
    # parse the content so it can be queried easily
    doc = Nokogiri::HTML(response.body, page_url, 'UTF-8')

    # director(s)
    doc.css("a[@rel='v:directedBy']").each do |director_node|
      director_href = director_node[:href]
      # id of the director page: the text between the first "=" and the last "."
      director_id = director_href[(director_href.index('=') + 1)..(director_href.rindex('.') - 1)].to_i
      film.add_director(find_or_create_by_id(Director, director_id, director_node.text))
    end

    # country / countries
    doc.xpath("//a[starts-with(@href, '/film/tous/pays-')]").each do |country_node|
      country_id = get_id2(country_node[:href])
      film.add_country(find_or_create_by_id(Country, country_id, country_node.text))
    end

    # genre(s)
    doc.xpath("//a[starts-with(@href, '/film/tous/genre-')]").each do |genre_node|
      genre_id = get_id2(genre_node[:href])
      film.add_genre(find_or_create_by_id(Genre, genre_id, genre_node.text))
    end

    # tag(s)
    doc.xpath("//a[starts-with(@href, '/tags/tag-')]").each do |tag_node|
      tag_id = get_id2(tag_node[:href])
      tag_name = tag_node.text.strip
      # keep only the name, without the occurrence counter
      m = TAG_NAME_REGEX.match(tag_name)
      tag_name = m[1] if m
      film.add_tag(find_or_create_by_id(Tag, tag_id, tag_name))
    end

    # average press grade (not always present): between the first "(" and the first ")"
    possible_press_average = doc.at("//a[starts-with(@href, '/film/revuedepresse_gen_cfilm=')]")
    if possible_press_average
      average_press_text = possible_press_average.parent.parent.text
      average_press = parse_average(average_press_text, average_press_text.index('('), average_press_text.index(')'))
      film.average_press = average_press if average_press
    end

    # individual press grades
    doc.xpath("//a[starts-with(@href, '/film/revuedepresse_gen_cfilm=#{film.id}.html#pressreview')]").each do |critic_node|
      # the grade is read from the title of the image next to the critic link
      grade_cell = critic_node.parent.parent.children[2]
      grade_image = grade_cell && grade_cell.search('img')[0]
      next unless grade_image
      critic = Critic.find_or_create(:name => critic_node.text)
      Grade.create(:critic => critic, :film => film, :value => grade_image[:title].to_i)
    end

    # average audience grade (not always present): between the last "(" and the last ")"
    possible_spectators_average = doc.at("//a[starts-with(@href, '/film/critiquepublic_gen_cfilm=')]")
    if possible_spectators_average
      average_people_text = possible_spectators_average.parent.parent.text
      average_people = parse_average(average_people_text, average_people_text.rindex('('), average_people_text.rindex(')'))
      film.average_people = average_people if average_people
    end

    # flag the film as scraped and save it
    film.scrapped = true
    film.save
  end
  request
end
hydra = Typhoeus::Hydra.new
# queue one request for every film that has not been scraped yet
Film.where(:scrapped => false).each do |film|
  hydra.queue create_request(film)
end
# run the requests and wait until they are all done
hydra.run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
# Some tags, directors and countries are duplicated -> remove the duplicates, keeping only the first of each
require_relative '../allocine_models' | |
# Merges the rows of +table+ that share the same name into the one with the
# lowest id: the rows of +join_table+ are re-pointed to it through
# +foreign_key+, then the other rows are deleted.
#
# The custom SQL datasets are executed with #update / #delete; building them
# with DB[...] alone does not send anything to the database.
#
# NOTE(review): a film linked to several duplicates ends up with several
# identical rows in the join table; confirm whether that matters.
#
# @param label [String] word printed before the duplicated name
# @param table [String] table holding the named rows
# @param join_table [String] join table referencing +table+
# @param foreign_key [String] column of +join_table+ referencing +table+
def merge_duplicates(label, table, join_table, foreign_key)
  # load every duplicate first so no write happens while the select is being read
  duplicates = DB.fetch("select count(*) c, min(id) i, name n from #{table} group by name having c > 1").all
  duplicates.each do |row|
    p "#{label} #{row[:n]} est dupliqué"
    # re-point and delete together so a failure cannot leave dangling references
    DB.transaction do
      DB["update #{join_table} set #{foreign_key} = ? where #{foreign_key} in (select id from #{table} where name = ? and id != ?)", row[:i], row[:n], row[:i]].update
      DB["delete from #{table} where name = ? and id != ?", row[:n], row[:i]].delete
    end
  end
end

merge_duplicates('Tag', 'tags', 'films_tags', 'tag_id')
merge_duplicates('Réalisateur', 'directors', 'directors_films', 'director_id')
merge_duplicates('Pays', 'countries', 'countries_films', 'country_id')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
# The data model
require 'sequel' | |
# Database connection (a local SQLite database is used).
DB = Sequel.connect('sqlite://allocine.sqlite')
# Declare the plural of "country" explicitly for the inflector.
Sequel.inflections do |inflect|
  inflect.irregular 'country', 'countries'
end
# A film, linked to its tags, genres, directors and countries, and to the
# grades it received.
class Film < Sequel::Model
  many_to_many :tags
  many_to_many :genres
  one_to_many :grades
  many_to_many :directors
  many_to_many :countries
end
# A film genre.
class Genre < Sequel::Model
  many_to_many :films
end
# A country associated with films.
class Country < Sequel::Model
  many_to_many :films
end
# A film director.
class Director < Sequel::Model
  many_to_many :films
end
# A tag attached to films.
class Tag < Sequel::Model
  many_to_many :films
end
# A press critic, who gives grades to films.
class Critic < Sequel::Model
  one_to_many :grades
end
# A grade given by one critic to one film.
class Grade < Sequel::Model
  many_to_one :film
  many_to_one :critic
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment