Skip to content

Instantly share code, notes, and snippets.

@arshamalh
Last active February 7, 2022 13:58
Show Gist options
  • Save arshamalh/d0dcf968544565bd1667cd1cd65b5598 to your computer and use it in GitHub Desktop.
Save arshamalh/d0dcf968544565bd1667cd1cd65b5598 to your computer and use it in GitHub Desktop.
Running a Scrapy spider alongside FastAPI or APScheduler, from another script or main.py.
from scrapy import crawler
from scrapy.utils.project import get_project_settings
# pip install crochet
from crochet import setup as crochet_setup, run_in_reactor
# Set up crochet before the @run_in_reactor function below is ever called
# (NOTE(review): required ordering per crochet's usage docs -- confirm).
crochet_setup()
# Shared crawl runner, configured from the Scrapy project's settings.
runner = crawler.CrawlerRunner(get_project_settings())
@run_in_reactor
def getMovieInfo():
    """Start a crawl of the "spider_name" spider through the shared runner.

    Returns:
        Whatever ``runner.crawl`` returns for this spider. The call is
        wrapped by ``run_in_reactor``, so callers presumably receive
        crochet's wrapper around that value -- confirm against crochet docs.
    """
    return runner.crawl("spider_name")
# This file is a hint related to the along_apscheduler_and_more_options.py file.
# It shows how to pass custom arguments to the spider whenever we call it from another script.
import scrapy
class SpiderNameSpider(scrapy.Spider):
    """Spider that keeps the ``custom_arguments`` value it was started with.

    The value is exposed as ``self.custom_params`` so the rest of the spider
    can read whatever the calling script passed in.
    """

    name = "spider_name"

    def __init__(self, *args, **kwargs):
        """Forward every argument to the base spider and record the custom one.

        Args:
            *args: Positional arguments, passed through to ``scrapy.Spider``
                unchanged so that positional spider arguments do not raise
                ``TypeError``.
            **kwargs: Keyword arguments, passed through unchanged. The
                optional ``custom_arguments`` entry is additionally stored on
                ``self.custom_params`` (``None`` when it was not supplied).
        """
        super().__init__(*args, **kwargs)
        self.custom_params = kwargs.get("custom_arguments")
from datetime import datetime
from pytz import utc
from scrapy import crawler
from scrapy.utils.project import get_project_settings
from crochet import setup as crochet_setup, run_in_reactor
from apscheduler.schedulers.blocking import BlockingScheduler
# Set up crochet before the @run_in_reactor function below is ever called
# (NOTE(review): required ordering per crochet's usage docs -- confirm).
crochet_setup()
# Shared crawl runner, configured from the Scrapy project's settings.
runner = crawler.CrawlerRunner(get_project_settings())
# Scheduler that will trigger the crawl; it is configured with the UTC timezone.
schedule = BlockingScheduler(timezone=utc)
def doSomethingAfterScrape(result=None):
    """Report that the crawl finished; intended as a Deferred callback.

    Args:
        result: Value handed over by the callback chain. Defaults to ``None``
            so the function can still be called with no arguments.

    Returns:
        ``result`` unchanged, so any callback added later still receives it.
    """
    # A Deferred passes the previous result as the first positional argument,
    # so a callback that accepts no parameters would fail with TypeError.
    print("Scrappying finished.")
    return result
@run_in_reactor
def DoSomeScrapy():
    """Start the "spider_name" crawl and attach the completion callback.

    Returns:
        The object returned by ``runner.crawl``, after
        ``doSomethingAfterScrape`` has been added to it as a callback.
    """
    # The spider class itself has to accept the custom keyword argument.
    crawl_done = runner.crawl("spider_name", custom_arguments="custom_values")
    crawl_done.addCallback(doSomethingAfterScrape)
    return crawl_done
# Register the crawl as a recurring interval job, then start the scheduler.
schedule.add_job(
    DoSomeScrapy,
    trigger="interval",
    seconds=1000,
    jitter=120,  # Randomises each run time; the value is in seconds.
    next_run_time=datetime.now(tz=utc),  # Optional argument.
)
schedule.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment