Created
January 31, 2025 09:24
-
-
Save jdhoek/0a9f04b1b70ba7e18c08cfb5c42b94f9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import http.client | |
import re | |
from lxml import etree | |
from unidecode import unidecode | |
from datetime import datetime | |
# Matches one opening-hours line as it appears on a store page: a Dutch
# two-letter weekday abbreviation, whitespace, then an "HH:MM-HH:MM" span.
# Group 1 is the weekday, group 2 the time span. Written as a raw string so
# that "\s" reaches the regex engine without being an invalid str escape.
OP_HOURS_REGEX = re.compile(r'(Ma|Di|Wo|Do|Vr|Za|Zo)\s+([0-9]{2}:[0-9]{2}-[0-9]{2}:[0-9]{2})')

# Local date at import time, formatted as YYYY-MM-DD.
TODAY = datetime.today().strftime('%Y-%m-%d')
def show_help():
    """Print one usage line per supported command, prefixed by the script name."""
    program = sys.argv[0]
    templates = (
        "{} analyze INPUT",
        "{} convert INPUT > OUTPUT",
        "{} openinghours URL",
    )
    for template in templates:
        print(template.format(program))
def parse_command():
    """Run the command named on the command line.

    ``sys.argv[1]`` selects the command and ``sys.argv[2]`` is its single
    argument. With fewer than two arguments the usage text is printed and the
    process exits with status 0; an unrecognised command prints the usage
    text and exits with status 2.
    """
    if len(sys.argv) < 3:
        show_help()
        sys.exit(0)

    command, argument = sys.argv[1], sys.argv[2]

    if command == "analyze":
        analyze(argument)
        return
    if command == "convert":
        convert(argument)
        return
    if command == "openinghours":
        # The result is printed as-is, so a failed lookup prints "None".
        print(openinghours(argument))
        return

    show_help()
    sys.exit(2)
def analyze(file_name):
    """Report Lidl elements in an XML file that lack address tags.

    Every child of the document root that ``is_lidl`` accepts is checked for
    the four address keys below. For each element missing at least one, a
    line naming the element id and the missing keys is printed. Afterwards
    the offending ids are printed once more, grouped by element tag name.

    Args:
        file_name: Path of the XML file to inspect.
    """
    address_keys = ('addr:street', 'addr:postcode', 'addr:housenumber', 'addr:city')
    incomplete = {}

    with open(file_name) as source:
        document = etree.parse(source)
        for node in document.getroot():
            if not is_lidl(node):
                continue
            node_id = node.get('id')
            present = {child.get('k') for child in node}
            # Keep the reporting order of the address keys stable.
            missing = [key for key in address_keys if key not in present]
            if not missing:
                continue
            incomplete.setdefault(node.tag, []).append(node_id)
            print(f'Error in {node_id}, missing: {missing}')

    for kind, ids in incomplete.items():
        print(kind + ': ' + ', '.join(ids))
def convert(file_name):
    """Rewrite the tags of every Lidl element in an XML file and print the result.

    Each child of the document root that ``is_lidl`` accepts is passed to
    ``fix_tags``; the whole tree is then serialised (UTF-8, with XML
    declaration, pretty-printed) and written to stdout.

    Args:
        file_name: Path of the XML file to convert.
    """
    with open(file_name) as source:
        document = etree.parse(source)
        # filter() is lazy, so each element is fixed right after it is tested.
        for node in filter(is_lidl, document.getroot()):
            fix_tags(node)
        serialized = etree.tostring(
            document,
            encoding="utf8",
            xml_declaration=True,
            pretty_print=True,
        )
        print(serialized.decode())
def _format_day_ranges(times):
    """Collapse per-day hours into a single '; '-separated rule string.

    Args:
        times: Mapping of day abbreviation to an hours string or None,
            iterated in week order.

    Returns:
        One rule per run of consecutive days sharing the same hours: a single
        day as ``Mo 08:00-21:00``, two days as ``Mo,Tu ...``, three or more
        as ``Mo-We ...``. Days without hours produce no rule; the result is
        an empty string when no day has hours.
    """
    rules = []
    run = []          # consecutive days that share run_hours
    run_hours = None
    # The trailing sentinel forces the last open run to be flushed.
    for day, day_hours in [*times.items(), (None, None)]:
        if run and day_hours == run_hours:
            run.append(day)
            continue
        if run:
            if len(run) == 1:
                days = run[0]
            elif len(run) == 2:
                days = f'{run[0]},{run[1]}'
            else:
                days = f'{run[0]}-{run[-1]}'
            rules.append(f'{days} {run_hours}')
        # A day without hours starts no run, so it can never be emitted.
        run = [day] if day_hours else []
        run_hours = day_hours
    return '; '.join(rules)


def openinghours(url):
    """Fetch a store page from www.lidl.nl and return its opening hours.

    The page's opening-hours block is located by CSS class, each child whose
    text matches ``OP_HOURS_REGEX`` supplies the hours for one weekday, and
    consecutive days with identical hours are merged into ranges.

    Args:
        url: Request target passed to the GET request.

    Returns:
        The compact hours string (possibly empty when no line matched), or
        None when the response status is not 200 or the opening-hours block
        is not found.
    """
    dutch_to_osm = {
        'Ma': 'Mo', 'Di': 'Tu', 'Wo': 'We', 'Do': 'Th',
        'Vr': 'Fr', 'Za': 'Sa', 'Zo': 'Su',
    }

    conn = http.client.HTTPSConnection('www.lidl.nl')
    try:
        conn.request('GET', url)
        res = conn.getresponse()
        if res.status != 200:
            # Diagnostics go to stderr: "convert INPUT > OUTPUT" redirects
            # stdout into the XML file, which must stay clean.
            print(f'Failed to GET URL {url}, got status {res.status}', file=sys.stderr)
            return None
        html = res.read().decode()
    finally:
        conn.close()

    root = etree.HTML(html)
    div_list = root.xpath("//div[contains(@class, 'lirt-o-store-detail-card__openingHours-data')]")
    if not div_list:
        print(f'Niets gevonden in de HTML-soep van Lidl op {url}', file=sys.stderr)
        return None

    # Week order matters: the range collapsing walks this dict in order.
    times = dict.fromkeys(dutch_to_osm.values())
    for p in div_list[0]:
        # Children without direct text have text None; treat them as no match.
        m = OP_HOURS_REGEX.match(p.text or '')
        if m:
            times[dutch_to_osm[m.group(1)]] = m.group(2)

    return _format_day_ranges(times)
def is_lidl(element):
    """Tell whether an element is tagged as a Lidl supermarket.

    Args:
        element: Iterable of tag objects exposing ``get('k')`` and ``get('v')``.

    Returns:
        True when the children include both ``shop=supermarket`` and
        ``brand=Lidl``, False otherwise.
    """
    pairs = {(tag.get('k'), tag.get('v')) for tag in element}
    return ('shop', 'supermarket') in pairs and ('brand', 'Lidl') in pairs
# Address keys needed to build a store URL.
_ADDRESS_KEYS = ('addr:city', 'addr:street', 'addr:housenumber', 'addr:postcode')

# Street slugs derived from the address tags that differ from the slug used
# in the store URL, mapped to the slug that is used there.
_STREET_SLUG_FIXES = {
    'handelplein': 'haendelplein',
    'walddyk': 'w%C3%A2lddyk',
    'mosaique': 'mosa%C3%AFque',
    'burgemeester-van-loonstraat': 'burg-van-loonstraat',
    'daniel-goedkoopstraat': 'dani%C3%ABl-goedkoopstraat',
    'burgemeester-van-hooffln': 'burgemeester-van-hoofflaan',
    'burgemeester-baumannlaan': 'burg-baumannlaan',
    'elias-annes-borgerstraat': 'e-a-borgerstraat',
    'van-oldenbarneveltplein': 'oldenbarneveltplein',
    'otto-c-huismanstraat': 'o-c-huismanstraat',
    'hammarskjoldhof': 'hammarskjoeldhof',
    'doctor-huber-noodtstraat': 'dr-huber-noodtstraat',
    'monseigneur-borretweg': 'mgr-borretweg',
    'sint-wirosingel': 'st-wirosingel',
    # Ed has his own square:
    'e-d-s-plein': 'eds-plein',
}

# Store pages that are skipped because the store is temporarily closed.
_TEMPORARILY_CLOSED_URLS = frozenset((
    'https://www.lidl.nl/s/nl-NL/filialen/utrecht/verlengde-houtrakgracht-383/',
    'https://www.lidl.nl/s/nl-NL/filialen/julianadorp/schoolweg-20/',
    'https://www.lidl.nl/s/nl-NL/filialen/gemert/grootmeestersstraat-9/',
    'https://www.lidl.nl/s/nl-NL/filialen/barneveld/oldenbarnevelderweg-155/',
))

# Tags that are dropped before the fresh values are appended.
_REPLACED_KEYS = (
    'website', 'brand:website', 'brand:wikipedia', 'brand:wikidata',
    'operator', 'opening_hours', 'addr:country', 'check_date:opening_hours',
)


def fix_tags(element):
    """Refresh the website, brand, operator and opening-hours tags of a store.

    The store URL is derived from the element's address tags, the opening
    hours are fetched from that URL via ``openinghours``, and on success the
    tags listed in ``_REPLACED_KEYS`` are removed, fresh website / brand /
    operator / opening_hours tags are appended and the element is marked
    with ``action="modify"``.

    The element is left untouched when an address tag is missing or empty,
    when the store is listed as temporarily closed, or when no opening hours
    could be obtained.

    Args:
        element: Element whose children are ``tag`` elements with ``k`` and
            ``v`` attributes; modified in place.
    """
    address = {}
    for tag in element:
        key = tag.get('k')
        if key in _ADDRESS_KEYS:
            address[key] = tag.get('v')

    # Without a full address no URL can be built; skip instead of crashing
    # on an unbound local further down.
    missing = [key for key in _ADDRESS_KEYS if not address.get(key)]
    if missing:
        element_id = element.get('id')
        print(f'Skipping {element_id}, missing: {missing}', file=sys.stderr)
        return

    place = unidecode(address['addr:city']).lower().replace(' ', '-').replace('\'', '')
    street = unidecode(address['addr:street']).lower().replace(' ', '-').replace('\'', '').replace('.', '-').replace('--', '-')
    housenumber = address['addr:housenumber'].lower().replace(' ', '-')
    postcode = address['addr:postcode'].lower()

    # Fix the names which don't match.
    if place == 's-gravenhage':
        place = 'den-haag'
    elif place == 's-hertogenbosch':
        # Not something they are fond of in Oeteldonk, I believe...
        place = 'den-bosch'
    elif postcode == '4904aw':
        # Aargh!!
        place = 'oosterhout-nb'
    elif postcode == '6678aa':
        # Aargh!!
        place = 'oosterhout-gld'
    elif place == 'damwald':
        place = 'damw%C3%A2ld'

    if street == 'tjalk-15':
        # Lelystad... here the house number is overridden, not the street.
        housenumber = '09'
    else:
        street = _STREET_SLUG_FIXES.get(street, street)

    url = f'https://www.lidl.nl/s/nl-NL/filialen/{place}/{street}-{housenumber}/'

    # Ignore temporary closed:
    if url in _TEMPORARILY_CLOSED_URLS:
        return

    hours = openinghours(url)
    if not hours:
        return

    # Iterate over a snapshot: children are removed inside the loop.
    for tag in list(element):
        if tag.get('k') in _REPLACED_KEYS:
            element.remove(tag)

    replacements = (
        ('website', url),
        ('brand:website', 'https://www.lidl.nl/'),
        ('brand:wikipedia', 'nl:Lidl'),
        ('brand:wikidata', 'Q151954'),
        ('operator', 'Lidl Nederland GmbH'),
        ('opening_hours', hours),
    )
    for key, value in replacements:
        element.append(etree.Element("tag", attrib={'k': key, 'v': value}))

    element.set('action', 'modify')
def test_url(url):
    """Check whether a GET request to www.lidl.nl for ``url`` answers 200.

    Args:
        url: Request target passed to the GET request.

    Returns:
        True when the response status is 200; otherwise the status and URL
        are printed and False is returned.
    """
    conn = http.client.HTTPSConnection('www.lidl.nl')
    try:
        conn.request('GET', url)
        res = conn.getresponse()
        if res.status != 200:
            print(f'{res.status}: {url}')
            return False
        return True
    finally:
        # Release the socket on every path, including request errors.
        conn.close()
# Script entry point: only dispatch when executed directly, not on import.
if "__main__" == __name__:
    parse_command()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment