Skip to content

Instantly share code, notes, and snippets.

@FSund
Created May 1, 2019 06:29
Show Gist options
  • Save FSund/66c05796868935051fc9629395856508 to your computer and use it in GitHub Desktop.
Save FSund/66c05796868935051fc9629395856508 to your computer and use it in GitHub Desktop.
Novelkeys/Gateron Ink web scraper
# sources:
# https://www.adventuresintechland.com/detect-when-a-webpage-changes-with-python/
# https://chrisalbon.com/python/web_scraping/monitor_a_website/
import hashlib
import random
import time
# fix: plain `import urllib` does not guarantee the .request / .error
# submodules are loaded; import them explicitly since both are used below
import urllib.error
import urllib.request

from bs4 import BeautifulSoup as soup
from pushbullet import Pushbullet

# url to be scraped
url = "https://novelkeys.xyz/collections/switches/products/gateron-ink-switches"

# time between checks in seconds
# sleeptime = 60  # every minute
sleeptime = 60 * 60  # every hour
def _getUserAgent():
# random integer to select user agent
randomint = random.randint(0, 7)
# User_Agents
# This helps skirt a bit around servers that detect repeaded requests from the same machine.
# This will not prevent your IP from getting banned but will help a bit by pretending to be different browsers
# and operating systems.
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (Windows NT 5.1) AppleWbepython Kit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19'
]
return user_agents[randomint]
def _getSoup():
    """Fetch the module-level `url` and parse the response with BeautifulSoup.

    Uses a randomized User-Agent header (see `_getUserAgent`) and a 30-second
    timeout on the request.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built with the "html.parser" backend.
    """
    agent = _getUserAgent()
    print("user agent:\n %s" % agent)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', agent)]
    # 30 s timeout so a hung server cannot stall the polling loop forever
    response = opener.open(url, timeout=30)
    return soup(response, features="html.parser")
def _doTheThing(pb):
    """Check the product page once; push a Pushbullet link if switches look available.

    Parameters
    ----------
    pb : pushbullet.Pushbullet
        Authenticated client used to send the availability notification.

    Network failures are printed and swallowed so the caller's polling loop
    simply retries on the next iteration.
    """
    try:
        web_soup = _getSoup()
    # fix: the original tuple listed TimeoutError twice; deduplicated
    except (TimeoutError, urllib.error.URLError) as err:
        print("Got known error, which will be ignored")
        print(err)
        return  # ignore the error, try again on the next poll
    soldout = web_soup.find(name="div", attrs={'class': 'swatch-element silent-ink soldout'})
    available = web_soup.find(name="div", attrs={'class': 'swatch-element silent-ink available'})
    # NOTE(review): this condition also fires when NEITHER marker is found
    # (e.g. after a page redesign) — presumably deliberate so a layout change
    # errs on the side of notifying; confirm before tightening.
    if available or not soldout:
        print("Gateron Inks ARE AVAILABLE")
        pb.push_link("GATERON INKS ARE AVAILABLE", url)
    else:
        print("Gateron Inks are not available")
        # pb.push_note("Not available", "Sorry.")
def doTheThingAndSleep(pb):
    """Run one availability check, then sleep roughly `sleeptime` seconds.

    The sleep duration is jittered by a uniform factor in [0.9, 1.1] so the
    requests do not land on an exact, easily detectable schedule.

    Parameters
    ----------
    pb : pushbullet.Pushbullet
        Client forwarded to `_doTheThing` for notifications.
    """
    _doTheThing(pb)
    jitter = random.uniform(0.9, 1.1)
    duration = sleeptime * jitter
    print("Sleeping for %d seconds" % duration)
    time.sleep(duration)
api_key = "Insert your Pushbullet API key here"
pb = Pushbullet(api_key)

# Poll forever; if anything unexpected escapes, notify via Pushbullet and re-raise.
while True:
    try:
        doTheThingAndSleep(pb)
    except Exception as err:
        pb.push_note("Your script crashed", str(err))
        # fix: bare `raise` preserves the original traceback (instead of
        # `raise err`), and the unreachable `break` after it was removed
        raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment