Skip to content

Instantly share code, notes, and snippets.

@jdhoek
Created January 31, 2025 09:24
Show Gist options
  • Save jdhoek/0a9f04b1b70ba7e18c08cfb5c42b94f9 to your computer and use it in GitHub Desktop.
Save jdhoek/0a9f04b1b70ba7e18c08cfb5c42b94f9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import http.client
import re
from lxml import etree
from unidecode import unidecode
from datetime import datetime
# Matches one opening-hours line: a Dutch two-letter day abbreviation,
# whitespace, then an HH:MM-HH:MM time span. Group 1 is the day, group 2 the
# time span. Written as a raw string so that `\s` reaches the regex engine
# instead of being treated as an (invalid) string escape sequence.
OP_HOURS_REGEX = re.compile(
    r'(Ma|Di|Wo|Do|Vr|Za|Zo)\s+([0-9]{2}:[0-9]{2}-[0-9]{2}:[0-9]{2})'
)
# Date on which the script is started, formatted as YYYY-MM-DD.
TODAY = datetime.today().strftime('%Y-%m-%d')
def show_help():
    """Print one usage line per supported sub-command to stdout.

    Every line starts with the program name taken from ``sys.argv[0]``.
    """
    program = sys.argv[0]
    usages = ("analyze INPUT", "convert INPUT > OUTPUT", "openinghours URL")
    for usage in usages:
        print(f"{program} {usage}")
def parse_command():
    """Run the sub-command named on the command line.

    ``sys.argv[1]`` selects the sub-command (``analyze``, ``convert`` or
    ``openinghours``) and ``sys.argv[2]`` is passed to it as its single
    argument.

    Exits with status 2 after printing the usage text when an argument is
    missing or the sub-command is unknown, and with status 1 when
    ``openinghours`` yields no result.
    """
    if len(sys.argv) < 3:
        show_help()
        # A missing argument is a usage error, just like an unknown command
        # below, so it must not report success.
        sys.exit(2)

    command, argument = sys.argv[1], sys.argv[2]
    if command == "analyze":
        analyze(argument)
    elif command == "convert":
        convert(argument)
    elif command == "openinghours":
        hours = openinghours(argument)
        if not hours:
            # openinghours() has already reported what went wrong; do not
            # print a literal "None" as if it were a result.
            sys.exit(1)
        print(hours)
    else:
        show_help()
        sys.exit(2)
def analyze(file_name):
    """Report Lidl supermarkets in an XML file that lack address tags.

    Every child of the document root that ``is_lidl`` accepts is checked for
    the keys ``addr:street``, ``addr:postcode``, ``addr:housenumber`` and
    ``addr:city``. For each element missing at least one of them an
    ``Error in ...`` line is printed. Afterwards one summary line per element
    tag name lists the ``id`` attributes of the offending elements.

    :param file_name: path of the XML file to inspect.
    """
    address_keys = ('addr:street', 'addr:postcode', 'addr:housenumber', 'addr:city')
    incomplete = {}  # element tag name -> ids of elements with missing keys

    with open(file_name) as input_file:
        root = etree.parse(input_file).getroot()

    for element in root:
        if not is_lidl(element):
            continue
        present = {tag.get('k') for tag in element}
        missing = [key for key in address_keys if key not in present]
        if not missing:
            continue
        element_id = element.get('id')
        incomplete.setdefault(element.tag, []).append(element_id)
        print(f'Error in {element_id}, missing: {missing}')

    for osm_type, ids in incomplete.items():
        print(osm_type + ': ' + ', '.join(ids))
def convert(file_name):
    """Rewrite the tags of every Lidl supermarket and print the document.

    Parses ``file_name`` as XML, passes each child of the root that
    ``is_lidl`` accepts to ``fix_tags`` (which edits it in place), then
    prints the whole tree to stdout, pretty-printed and with an XML
    declaration.

    :param file_name: path of the XML file to convert.
    """
    with open(file_name) as input_file:
        tree = etree.parse(input_file)

    # filter() is lazy, so each element is still checked right before it is
    # fixed, one element at a time.
    for element in filter(is_lidl, tree.getroot()):
        fix_tags(element)

    serialized = etree.tostring(
        tree,
        encoding="utf8",
        xml_declaration=True,
        pretty_print=True,
    )
    print(serialized.decode())
# Dutch day abbreviations matched by OP_HOURS_REGEX, mapped to the two-letter
# English abbreviations used in the result. The insertion order (Monday to
# Sunday) is relied upon when consecutive days are collapsed into ranges.
_DAY_ABBREVIATIONS = {
    'Ma': 'Mo',
    'Di': 'Tu',
    'Wo': 'We',
    'Do': 'Th',
    'Vr': 'Fr',
    'Za': 'Sa',
    'Zo': 'Su',
}


def _format_day_run(days, hours):
    """Format one run of consecutive days that share the same hours."""
    if len(days) == 1:
        return f'{days[0]} {hours}'
    if len(days) == 2:
        return f'{days[0]},{days[1]} {hours}'
    return f'{days[0]}-{days[-1]} {hours}'


def _collapse_days(times):
    """Join per-day hours into rules such as ``Mo-Fr 08:00-21:00; Sa 08:00-20:00``.

    ``times`` maps day abbreviations, in week order, to a time span or to
    ``None``. Days without hours never appear in the result and break any
    run of consecutive days.
    """
    rules = []
    run = []          # consecutive days sharing ``run_hours``
    run_hours = None
    # The trailing (None, None) sentinel flushes the last run.
    for day, hours in list(times.items()) + [(None, None)]:
        if run and hours == run_hours:
            run.append(day)
            continue
        if run:
            rules.append(_format_day_run(run, run_hours))
        run = [day] if hours else []
        run_hours = hours
    return '; '.join(rules)


def openinghours(url):
    """Fetch a store page from www.lidl.nl and extract its opening hours.

    The page is requested over HTTPS. The children of the first ``div`` whose
    class contains ``lirt-o-store-detail-card__openingHours-data`` are matched
    against ``OP_HOURS_REGEX``. Consecutive days with identical hours are
    merged: one day is written as ``Mo 08:00-21:00``, two days as
    ``Mo,Tu ...`` and longer runs as ``Mo-Sa ...``. Rules are joined with
    ``'; '``.

    Problems are reported on stderr so that they cannot end up in output the
    caller writes to stdout.

    :param url: request target passed to the GET request.
    :return: the formatted opening hours (an empty string when no line
        matched), or ``None`` when the response status is not 200 or the
        opening-hours ``div`` is absent.
    """
    conn = http.client.HTTPSConnection('www.lidl.nl')
    try:
        conn.request('GET', url)
        res = conn.getresponse()
        if res.status != 200:
            print(f'Failed to GET URL {url}, got status {res.status}', file=sys.stderr)
            return None
        html = res.read().decode()
    finally:
        # Release the socket on every path, including errors.
        conn.close()

    root = etree.HTML(html)
    div_list = root.xpath("//div[contains(@class, 'lirt-o-store-detail-card__openingHours-data')]")
    if len(div_list) == 0:
        print(f'Niets gevonden in de HTML-soep van Lidl op {url}', file=sys.stderr)
        return None

    times = dict.fromkeys(_DAY_ABBREVIATIONS.values())
    for p in div_list[0]:
        # A child without text has ``text`` set to None; treat it as a
        # non-matching line instead of letting the regex raise TypeError.
        m = OP_HOURS_REGEX.match(p.text or '')
        if m:
            times[_DAY_ABBREVIATIONS[m.group(1)]] = m.group(2)

    return _collapse_days(times)
def is_lidl(element):
    """Tell whether ``element`` is tagged as a Lidl supermarket.

    :param element: an iterable of child tags, each exposing ``get('k')`` and
        ``get('v')`` for its key and value.
    :return: ``True`` only if the children include both ``shop=supermarket``
        and ``brand=Lidl``, otherwise ``False``.
    """
    pairs = {(tag.get('k'), tag.get('v')) for tag in element}
    return ('shop', 'supermarket') in pairs and ('brand', 'Lidl') in pairs
# Street slugs derived from the address tags that differ from the slug used
# in the store page URL, mapped to the slug that the URL needs.
_STREET_SLUG_FIXES = {
    'handelplein': 'haendelplein',
    'walddyk': 'w%C3%A2lddyk',
    'mosaique': 'mosa%C3%AFque',
    'burgemeester-van-loonstraat': 'burg-van-loonstraat',
    'daniel-goedkoopstraat': 'dani%C3%ABl-goedkoopstraat',
    'burgemeester-van-hooffln': 'burgemeester-van-hoofflaan',
    'burgemeester-baumannlaan': 'burg-baumannlaan',
    'elias-annes-borgerstraat': 'e-a-borgerstraat',
    'van-oldenbarneveltplein': 'oldenbarneveltplein',
    'otto-c-huismanstraat': 'o-c-huismanstraat',
    'hammarskjoldhof': 'hammarskjoeldhof',
    'doctor-huber-noodtstraat': 'dr-huber-noodtstraat',
    'monseigneur-borretweg': 'mgr-borretweg',
    'sint-wirosingel': 'st-wirosingel',
    # Ed has his own square:
    'e-d-s-plein': 'eds-plein',
}

# Store pages of temporarily closed stores; these elements are left untouched.
_TEMPORARILY_CLOSED_URLS = frozenset((
    'https://www.lidl.nl/s/nl-NL/filialen/utrecht/verlengde-houtrakgracht-383/',
    'https://www.lidl.nl/s/nl-NL/filialen/julianadorp/schoolweg-20/',
    'https://www.lidl.nl/s/nl-NL/filialen/gemert/grootmeestersstraat-9/',
    'https://www.lidl.nl/s/nl-NL/filialen/barneveld/oldenbarnevelderweg-155/',
))

# Keys whose existing tags are removed before the fresh tags are appended.
_REPLACED_TAG_KEYS = frozenset((
    'website',
    'brand:website',
    'brand:wikipedia',
    'brand:wikidata',
    'operator',
    'opening_hours',
    'addr:country',
    'check_date:opening_hours',
))

# Address keys needed to build the store page URL.
_ADDRESS_KEYS = ('addr:city', 'addr:street', 'addr:housenumber', 'addr:postcode')


def fix_tags(element):
    """Refresh the website, brand, operator and opening-hours tags in place.

    The store page URL is derived from the element's address tags, with a
    number of hand-maintained corrections where the derived slug does not
    match the site. The opening hours are fetched from that URL through
    ``openinghours``. On success every tag whose key is in
    ``_REPLACED_TAG_KEYS`` is removed, fresh ``website``, ``brand:website``,
    ``brand:wikipedia``, ``brand:wikidata``, ``operator`` and
    ``opening_hours`` tags are appended, and the element gets
    ``action="modify"``.

    The element is left untouched when an address tag is missing or empty
    (a warning goes to stderr), when the URL belongs to a temporarily closed
    store, or when no opening hours could be obtained.

    :param element: the element whose child ``tag`` elements are rewritten.
    """
    # A later duplicate key overrides an earlier one.
    values = {tag.get('k'): tag.get('v') for tag in element}
    missing = [key for key in _ADDRESS_KEYS if not values.get(key)]
    if missing:
        # Without a complete address no URL can be built; skip the element
        # instead of failing on an unbound variable.
        print(f"Skipping {element.tag} {element.get('id')}, missing: {missing}",
              file=sys.stderr)
        return

    place = unidecode(values['addr:city']).lower().replace(' ', '-').replace('\'', '')
    street = (unidecode(values['addr:street']).lower()
              .replace(' ', '-').replace('\'', '').replace('.', '-').replace('--', '-'))
    housenumber = values['addr:housenumber'].lower().replace(' ', '-')
    postcode = values['addr:postcode'].lower()

    # Fix the names which don't match.
    if place == 's-gravenhage':
        place = 'den-haag'
    elif place == 's-hertogenbosch':
        # I believe they are no fans of this one in Oeteldonk...
        place = 'den-bosch'
    elif postcode == '4904aw':
        # Aargh!!
        place = 'oosterhout-nb'
    elif postcode == '6678aa':
        # Aargh!!
        place = 'oosterhout-gld'
    elif place == 'damwald':
        place = 'damw%C3%A2ld'

    if street == 'tjalk-15':
        # Lelystad... here the house number is overridden, not the street.
        housenumber = '09'
    else:
        street = _STREET_SLUG_FIXES.get(street, street)

    url = f'https://www.lidl.nl/s/nl-NL/filialen/{place}/{street}-{housenumber}/'

    # Ignore temporarily closed stores.
    if url in _TEMPORARILY_CLOSED_URLS:
        return

    hours = openinghours(url)
    if not hours:
        return

    # Iterate over a snapshot: children are removed inside the loop.
    for tag in list(element):
        if tag.get('k') in _REPLACED_TAG_KEYS:
            element.remove(tag)

    # NOTE(review): 'check_date:opening_hours' and 'addr:country' are removed
    # above but not written back - confirm that this is intended.
    new_tags = (
        ('website', url),
        ('brand:website', 'https://www.lidl.nl/'),
        ('brand:wikipedia', 'nl:Lidl'),
        ('brand:wikidata', 'Q151954'),
        ('operator', 'Lidl Nederland GmbH'),
        ('opening_hours', hours),
    )
    for key, value in new_tags:
        element.append(etree.Element("tag", attrib={'k': key, 'v': value}))

    element.set('action', 'modify')
def test_url(url):
    """Check whether a GET request for ``url`` on www.lidl.nl returns 200.

    For any other status a line of the form ``<status>: <url>`` is printed.

    :param url: request target passed to the GET request.
    :return: ``True`` when the response status is 200, ``False`` otherwise.
    """
    conn = http.client.HTTPSConnection('www.lidl.nl')
    try:
        conn.request('GET', url)
        res = conn.getresponse()
        if res.status != 200:
            print(f'{res.status}: {url}')
            return False
        return True
    finally:
        # Release the socket on every path, including errors.
        conn.close()
# Dispatch the command line only when this file is executed as a script.
if __name__ == "__main__":
    parse_command()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment