Last active
February 7, 2022 13:58
-
-
Save arshamalh/d0dcf968544565bd1667cd1cd65b5598 to your computer and use it in GitHub Desktop.
Running Scrapy spider along fastAPI or APScheduler, from another script or main.py.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import crawler
from scrapy.utils.project import get_project_settings
# pip install crochet
from crochet import setup as crochet_setup, run_in_reactor

# Install crochet's hooks so Twisted's reactor runs in a background thread.
# This lets ordinary blocking code (FastAPI handlers, APScheduler jobs, a
# plain main.py) start Scrapy crawls without managing the reactor itself.
crochet_setup()
runner = crawler.CrawlerRunner(get_project_settings())


@run_in_reactor
def getMovieInfo(spider_name="spider_name"):
    """Start a Scrapy spider inside the crochet-managed reactor.

    Args:
        spider_name: Name of the spider to run. Defaults to "spider_name",
            matching the value the original snippet hard-coded, so existing
            zero-argument callers behave identically.

    Returns:
        The Twisted Deferred for the crawl; @run_in_reactor wraps it in a
        crochet EventualResult for the (non-reactor) caller.
    """
    deferred = runner.crawl(spider_name)
    return deferred
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Companion hint for the along_apscheduler_and_more_options.py file:
# shows how a spider receives custom keyword arguments when it is started
# from another script, e.g. runner.crawl("spider_name", custom_arguments=...).
import scrapy


class SpiderNameSpider(scrapy.Spider):
    """Spider that accepts an optional `custom_arguments` keyword from the caller."""

    name = "spider_name"

    def __init__(self, *args, **kwargs):
        # Python 3 zero-argument super(); also forward positional arguments
        # so the signature stays compatible with scrapy.Spider.__init__.
        super().__init__(*args, **kwargs)
        # None when the caller did not supply `custom_arguments`.
        self.custom_params = kwargs.get("custom_arguments")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime

from pytz import utc
from scrapy import crawler
from scrapy.utils.project import get_project_settings
from crochet import setup as crochet_setup, run_in_reactor
from apscheduler.schedulers.blocking import BlockingScheduler

# Run Twisted's reactor in a background thread so the BlockingScheduler
# below can fire crawls from its own (blocking) loop.
crochet_setup()
runner = crawler.CrawlerRunner(get_project_settings())
schedule = BlockingScheduler(timezone=utc)


def doSomethingAfterScrape(result=None):
    """Callback fired when the crawl's Deferred resolves.

    BUGFIX: Twisted invokes addCallback targets with the deferred's result
    as a positional argument; the original zero-argument signature raised
    TypeError the moment the crawl finished. `result` defaults to None so
    any existing direct zero-argument calls still work.
    """
    print("Scrappying finished.")


@run_in_reactor
def DoSomeScrapy():
    """Start the spider inside the reactor and attach the completion callback."""
    # Custom keyword arguments must be handled in the spider's __init__.
    deferred = runner.crawl("spider_name", custom_arguments="custom_values")
    deferred.addCallback(doSomethingAfterScrape)
    return deferred


schedule.add_job(
    DoSomeScrapy,
    'interval',
    next_run_time=datetime.now(tz=utc),  # Optional: also run once immediately.
    jitter=120,  # Randomize each run by up to 120 seconds.
    seconds=1000,
)
schedule.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment